diff --git a/.gitignore b/.gitignore
index 54f5f4cff73c9..62aba435754d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,16 +8,19 @@
*.pyc
.idea/
.idea_modules/
-sbt/*.jar
+build/*.jar
.settings
.cache
+cache
.generated-mima*
-/build/
work/
out/
.DS_Store
third_party/libmesos.so
third_party/libmesos.dylib
+build/apache-maven*
+build/zinc*
+build/scala*
conf/java-opts
conf/*.sh
conf/*.cmd
@@ -51,10 +54,11 @@ checkpoint
derby.log
dist/
dev/create-release/*txt
-dev/create-release/*new
+dev/create-release/*final
spark-*-bin-*.tgz
unit-tests.log
/lib/
+ec2/lib/
rat-results.txt
scalastyle.txt
conf/spark-defaults.conf.bak
diff --git a/.rat-excludes b/.rat-excludes
index d8bee1f8e49c9..769defbac11b7 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -64,3 +64,4 @@ dist/*
logs
.*scalastyle-output.xml
.*dependency-reduced-pom.xml
+known_translations
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 78fb908f9a9ef..b2a9d0780ee2b 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -354,5 +354,25 @@
+
+    <!-- Profiles that disable inclusion of certain dependencies. -->
+    <profile>
+      <id>hadoop-provided</id>
+      <properties>
+        <hadoop.deps.scope>provided</hadoop.deps.scope>
+      </properties>
+    </profile>
+    <profile>
+      <id>hive-provided</id>
+      <properties>
+        <hive.deps.scope>provided</hive.deps.scope>
+      </properties>
+    </profile>
+    <profile>
+      <id>parquet-provided</id>
+      <properties>
+        <parquet.deps.scope>provided</parquet.deps.scope>
+      </properties>
+    </profile>
diff --git a/bagel/pom.xml b/bagel/pom.xml
index 0327ffa402671..510e92640eff8 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -40,15 +40,6 @@
      <artifactId>spark-core_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
-    <dependency>
-      <groupId>org.eclipse.jetty</groupId>
-      <artifactId>jetty-server</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.scalatest</groupId>
-      <artifactId>scalatest_${scala.binary.version}</artifactId>
-      <scope>test</scope>
-    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
@@ -58,11 +49,5 @@
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <plugins>
-      <plugin>
-        <groupId>org.scalatest</groupId>
-        <artifactId>scalatest-maven-plugin</artifactId>
-      </plugin>
-    </plugins>
diff --git a/bagel/src/test/resources/log4j.properties b/bagel/src/test/resources/log4j.properties
index 789869f72e3b0..853ef0ed2986f 100644
--- a/bagel/src/test/resources/log4j.properties
+++ b/bagel/src/test/resources/log4j.properties
@@ -15,10 +15,10 @@
# limitations under the License.
#
-# Set everything to be logged to the file bagel/target/unit-tests.log
+# Set everything to be logged to the file target/unit-tests.log
log4j.rootCategory=INFO, file
log4j.appender.file=org.apache.log4j.FileAppender
-log4j.appender.file.append=false
+log4j.appender.file.append=true
log4j.appender.file.file=target/unit-tests.log
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
index a4c099fb45b14..088f993954d9e 100644
--- a/bin/compute-classpath.cmd
+++ b/bin/compute-classpath.cmd
@@ -109,6 +109,13 @@ if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir
set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR%
:no_yarn_conf_dir
+rem To allow for distributions to append needed libraries to the classpath (e.g. when
+rem using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
+rem append it to the final classpath.
+if not "x%$SPARK_DIST_CLASSPATH%"=="x" (
+ set CLASSPATH=%CLASSPATH%;%SPARK_DIST_CLASSPATH%
+)
+
rem A bit of a hack to allow calling this script within run2.cmd without seeing output
if "%DONT_PRINT_CLASSPATH%"=="1" goto exit
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 298641f2684de..9e8d0b785194e 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -25,7 +25,11 @@ FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
. "$FWDIR"/bin/load-spark-env.sh
-CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH"
+if [ -n "$SPARK_CLASSPATH" ]; then
+ CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH"
+else
+ CLASSPATH="$SPARK_SUBMIT_CLASSPATH"
+fi
# Build up classpath
if [ -n "$SPARK_CONF_DIR" ]; then
@@ -68,22 +72,25 @@ else
assembly_folder="$ASSEMBLY_DIR"
fi
-num_jars="$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)"
-if [ "$num_jars" -eq "0" ]; then
- echo "Failed to find Spark assembly in $assembly_folder"
- echo "You need to build Spark before running this program."
- exit 1
-fi
+num_jars=0
+
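+# If no assembly jar matches, the glob is left unexpanded and the -e check below reports a missing build.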
+for f in ${assembly_folder}/spark-assembly*hadoop*.jar; do
+ if [[ ! -e "$f" ]]; then
+ echo "Failed to find Spark assembly in $assembly_folder" 1>&2
+ echo "You need to build Spark before running this program." 1>&2
+ exit 1
+ fi
+ ASSEMBLY_JAR="$f"
+ num_jars=$((num_jars+1))
+done
+
if [ "$num_jars" -gt "1" ]; then
- jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
- echo "Found multiple Spark assembly jars in $assembly_folder:"
- echo "$jars_list"
- echo "Please remove all but one jar."
+ echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2
+ ls ${assembly_folder}/spark-assembly*hadoop*.jar 1>&2
+ echo "Please remove all but one jar." 1>&2
exit 1
fi
-ASSEMBLY_JAR="$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)"
-
# Verify that versions of java used to build the jars and run Spark are compatible
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
@@ -108,7 +115,7 @@ else
datanucleus_dir="$FWDIR"/lib_managed/jars
fi
-datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")"
+datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")"
datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)"
if [ -n "$datanucleus_jars" ]; then
@@ -142,4 +149,11 @@ if [ -n "$YARN_CONF_DIR" ]; then
CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
fi
+# To allow for distributions to append needed libraries to the classpath (e.g. when
+# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
+# append it to the final classpath.
+if [ -n "$SPARK_DIST_CLASSPATH" ]; then
+ CLASSPATH="$CLASSPATH:$SPARK_DIST_CLASSPATH"
+fi
+
echo "$CLASSPATH"
diff --git a/bin/run-example b/bin/run-example
index 3d932509426fc..c567acf9a6b5c 100755
--- a/bin/run-example
+++ b/bin/run-example
@@ -35,17 +35,32 @@ else
fi
if [ -f "$FWDIR/RELEASE" ]; then
- export SPARK_EXAMPLES_JAR="`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar`"
-elif [ -e "$EXAMPLES_DIR"/target/scala-$SPARK_SCALA_VERSION/spark-examples-*hadoop*.jar ]; then
- export SPARK_EXAMPLES_JAR="`ls "$EXAMPLES_DIR"/target/scala-$SPARK_SCALA_VERSION/spark-examples-*hadoop*.jar`"
+ JAR_PATH="${FWDIR}/lib"
+else
+ JAR_PATH="${EXAMPLES_DIR}/target/scala-${SPARK_SCALA_VERSION}"
fi
-if [[ -z "$SPARK_EXAMPLES_JAR" ]]; then
- echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2
- echo "You need to build Spark before running this program" 1>&2
+JAR_COUNT=0
+
+for f in ${JAR_PATH}/spark-examples-*hadoop*.jar; do
+ if [[ ! -e "$f" ]]; then
+ echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2
+ echo "You need to build Spark before running this program" 1>&2
+ exit 1
+ fi
+ SPARK_EXAMPLES_JAR="$f"
+ JAR_COUNT=$((JAR_COUNT+1))
+done
+
+if [ "$JAR_COUNT" -gt "1" ]; then
+ echo "Found multiple Spark examples assembly jars in ${JAR_PATH}" 1>&2
+ ls ${JAR_PATH}/spark-examples-*hadoop*.jar 1>&2
+ echo "Please remove all but one jar." 1>&2
exit 1
fi
+export SPARK_EXAMPLES_JAR
+
EXAMPLE_MASTER=${MASTER:-"local[*]"}
if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
diff --git a/bin/spark-class b/bin/spark-class
index 0d58d95c1aee3..1b945461fabc8 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -71,6 +71,8 @@ case "$1" in
'org.apache.spark.executor.MesosExecutorBackend')
OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_EXECUTOR_OPTS"
OUR_JAVA_MEM=${SPARK_EXECUTOR_MEMORY:-$DEFAULT_MEM}
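+ # Make the PySpark sources and the bundled Py4J zip visible to Python tasks launched via the Mesos executor backend.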
+ export PYTHONPATH="$FWDIR/python:$PYTHONPATH"
+ export PYTHONPATH="$FWDIR/python/lib/py4j-0.8.2.1-src.zip:$PYTHONPATH"
;;
# Spark submit uses SPARK_JAVA_OPTS + SPARK_SUBMIT_OPTS +
@@ -148,7 +150,7 @@ fi
if [[ "$1" =~ org.apache.spark.tools.* ]]; then
if test -z "$SPARK_TOOLS_JAR"; then
echo "Failed to find Spark Tools Jar in $FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/" 1>&2
- echo "You need to build Spark before running $1." 1>&2
+ echo "You need to run \"build/sbt tools/package\" before running $1." 1>&2
exit 1
fi
CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
diff --git a/bin/spark-shell b/bin/spark-shell
index 4a0670fc6c8aa..cca5aa0676123 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -45,6 +45,13 @@ source "$FWDIR"/bin/utils.sh
SUBMIT_USAGE_FUNCTION=usage
gatherSparkSubmitOpts "$@"
+# SPARK-4161: scala does not assume use of the java classpath,
+# so we need to add the "-Dscala.usejavacp=true" flag manually. We
+# do this specifically for the Spark shell because the scala REPL
+# has its own class loader, and any additional classpath specified
+# through spark.driver.extraClassPath is not automatically propagated.
+SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
+
function main() {
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
diff --git a/bin/spark-shell2.cmd b/bin/spark-shell2.cmd
index 2ee60b4e2a2b3..1d1a40da315eb 100644
--- a/bin/spark-shell2.cmd
+++ b/bin/spark-shell2.cmd
@@ -19,4 +19,23 @@ rem
set SPARK_HOME=%~dp0..
-cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd --class org.apache.spark.repl.Main %* spark-shell
+echo "%*" | findstr " --help -h" >nul
+if %ERRORLEVEL% equ 0 (
+ call :usage
+ exit /b 0
+)
+
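+rem Separate spark-submit options (SUBMISSION_OPTS) from application options (APPLICATION_OPTS).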
+call %SPARK_HOME%\bin\windows-utils.cmd %*
+if %ERRORLEVEL% equ 1 (
+ call :usage
+ exit /b 1
+)
+
+cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd --class org.apache.spark.repl.Main %SUBMISSION_OPTS% spark-shell %APPLICATION_OPTS%
+
+exit /b 0
+
+:usage
+echo "Usage: .\bin\spark-shell.cmd [options]" >&2
+%SPARK_HOME%\bin\spark-submit --help 2>&1 | findstr /V "Usage" 1>&2
+exit /b 0
diff --git a/bin/spark-submit b/bin/spark-submit
index f92d90c3a66b0..3e5cbdbb24394 100755
--- a/bin/spark-submit
+++ b/bin/spark-submit
@@ -38,11 +38,19 @@ while (($#)); do
export SPARK_SUBMIT_CLASSPATH=$2
elif [ "$1" = "--driver-java-options" ]; then
export SPARK_SUBMIT_OPTS=$2
+ elif [ "$1" = "--master" ]; then
+ export MASTER=$2
fi
shift
done
-DEFAULT_PROPERTIES_FILE="$SPARK_HOME/conf/spark-defaults.conf"
+if [ -z "$SPARK_CONF_DIR" ]; then
+ export SPARK_CONF_DIR="$SPARK_HOME/conf"
+fi
+DEFAULT_PROPERTIES_FILE="$SPARK_CONF_DIR/spark-defaults.conf"
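+# A master of "yarn-cluster" implies cluster deploy mode; set it before the client default below is applied.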
+if [ "$MASTER" == "yarn-cluster" ]; then
+ SPARK_SUBMIT_DEPLOY_MODE=cluster
+fi
export SPARK_SUBMIT_DEPLOY_MODE=${SPARK_SUBMIT_DEPLOY_MODE:-"client"}
export SPARK_SUBMIT_PROPERTIES_FILE=${SPARK_SUBMIT_PROPERTIES_FILE:-"$DEFAULT_PROPERTIES_FILE"}
diff --git a/bin/spark-submit2.cmd b/bin/spark-submit2.cmd
index cf6046d1547ad..12244a9cb04fb 100644
--- a/bin/spark-submit2.cmd
+++ b/bin/spark-submit2.cmd
@@ -24,7 +24,11 @@ set ORIG_ARGS=%*
rem Reset the values of all variables used
set SPARK_SUBMIT_DEPLOY_MODE=client
-set SPARK_SUBMIT_PROPERTIES_FILE=%SPARK_HOME%\conf\spark-defaults.conf
+
+if not defined SPARK_CONF_DIR (
+ set SPARK_CONF_DIR=%SPARK_HOME%\conf
+)
+set SPARK_SUBMIT_PROPERTIES_FILE=%SPARK_CONF_DIR%\spark-defaults.conf
set SPARK_SUBMIT_DRIVER_MEMORY=
set SPARK_SUBMIT_LIBRARY_PATH=
set SPARK_SUBMIT_CLASSPATH=
@@ -45,11 +49,17 @@ if [%1] == [] goto continue
set SPARK_SUBMIT_CLASSPATH=%2
) else if [%1] == [--driver-java-options] (
set SPARK_SUBMIT_OPTS=%2
+ ) else if [%1] == [--master] (
+ set MASTER=%2
)
shift
goto loop
:continue
+if [%MASTER%] == [yarn-cluster] (
+ set SPARK_SUBMIT_DEPLOY_MODE=cluster
+)
+
rem For client mode, the driver will be launched in the same JVM that launches
rem SparkSubmit, so we may need to read the properties file for any extra class
rem paths, library paths, java options and memory early on. Otherwise, it will
diff --git a/bin/windows-utils.cmd b/bin/windows-utils.cmd
new file mode 100644
index 0000000000000..1082a952dac99
--- /dev/null
+++ b/bin/windows-utils.cmd
@@ -0,0 +1,59 @@
+rem
+rem Licensed to the Apache Software Foundation (ASF) under one or more
+rem contributor license agreements. See the NOTICE file distributed with
+rem this work for additional information regarding copyright ownership.
+rem The ASF licenses this file to You under the Apache License, Version 2.0
+rem (the "License"); you may not use this file except in compliance with
+rem the License. You may obtain a copy of the License at
+rem
+rem http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem
+
+rem Gather all spark-submit options into SUBMISSION_OPTS
+
+set SUBMISSION_OPTS=
+set APPLICATION_OPTS=
+
+rem NOTE: If you add or remove spark-submit options,
+rem modify NOT ONLY this script but also SparkSubmitArguments.scala
+
+:OptsLoop
+if "x%1"=="x" (
+ goto :OptsLoopEnd
+)
+
+SET opts="\<--master\> \<--deploy-mode\> \<--class\> \<--name\> \<--jars\> \<--py-files\> \<--files\>"
+SET opts="%opts:~1,-1% \<--conf\> \<--properties-file\> \<--driver-memory\> \<--driver-java-options\>"
+SET opts="%opts:~1,-1% \<--driver-library-path\> \<--driver-class-path\> \<--executor-memory\>"
+SET opts="%opts:~1,-1% \<--driver-cores\> \<--total-executor-cores\> \<--executor-cores\> \<--queue\>"
+SET opts="%opts:~1,-1% \<--num-executors\> \<--archives\>"
+
+echo %1 | findstr %opts% >nul
+if %ERRORLEVEL% equ 0 (
+ if "x%2"=="x" (
+ echo "%1" requires an argument. >&2
+ exit /b 1
+ )
+ set SUBMISSION_OPTS=%SUBMISSION_OPTS% %1 %2
+ shift
+ shift
+ goto :OptsLoop
+)
+echo %1 | findstr "\<--verbose\> \<-v\> \<--supervise\>" >nul
+if %ERRORLEVEL% equ 0 (
+ set SUBMISSION_OPTS=%SUBMISSION_OPTS% %1
+ shift
+ goto :OptsLoop
+)
+set APPLICATION_OPTS=%APPLICATION_OPTS% %1
+shift
+goto :OptsLoop
+
+:OptsLoopEnd
+exit /b 0
diff --git a/build/mvn b/build/mvn
new file mode 100755
index 0000000000000..43471f83e904c
--- /dev/null
+++ b/build/mvn
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Determine the current working directory
+_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Preserve the calling directory
+_CALLING_DIR="$(pwd)"
+
+# Installs any application tarball given a URL, the expected tarball name,
+# and, optionally, a checkable binary path to determine if the binary has
+# already been installed
+## Arg1 - URL
+## Arg2 - Tarball Name
+## Arg3 - Checkable Binary
+install_app() {
+ local remote_tarball="$1/$2"
+ local local_tarball="${_DIR}/$2"
+ local binary="${_DIR}/$3"
+
+ # setup `curl` and `wget` silent options if we're running on Jenkins
+ local curl_opts=""
+ local wget_opts=""
+ if [ -n "$AMPLAB_JENKINS" ]; then
+ curl_opts="-s"
+ wget_opts="--quiet"
+ else
+ curl_opts="--progress-bar"
+ wget_opts="--progress=bar:force"
+ fi
+
+ if [ -z "$3" -o ! -f "$binary" ]; then
+ # check if we already have the tarball
+ # check if we have curl installed
+ # download application
+ [ ! -f "${local_tarball}" ] && [ -n "`which curl 2>/dev/null`" ] && \
+ echo "exec: curl ${curl_opts} ${remote_tarball}" && \
+ curl ${curl_opts} "${remote_tarball}" > "${local_tarball}"
+ # if the file still doesn't exist, let's try `wget` and cross our fingers
+ [ ! -f "${local_tarball}" ] && [ -n "`which wget 2>/dev/null`" ] && \
+ echo "exec: wget ${wget_opts} ${remote_tarball}" && \
+ wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}"
+ # if both were unsuccessful, exit
+ [ ! -f "${local_tarball}" ] && \
+ echo -n "ERROR: Cannot download $2 with cURL or wget; " && \
+ echo "please install manually and try again." && \
+ exit 2
+ cd "${_DIR}" && tar -xzf "$2"
+ rm -rf "$local_tarball"
+ fi
+}
+
+# Install maven under the build/ folder
+install_mvn() {
+ install_app \
+ "http://apache.claz.org/maven/maven-3/3.2.3/binaries" \
+ "apache-maven-3.2.3-bin.tar.gz" \
+ "apache-maven-3.2.3/bin/mvn"
+ MVN_BIN="${_DIR}/apache-maven-3.2.3/bin/mvn"
+}
+
+# Install zinc under the build/ folder
+install_zinc() {
+ local zinc_path="zinc-0.3.5.3/bin/zinc"
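+ # Remember whether zinc had to be installed so a fresh server can be started further below.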
+ [ ! -f "${zinc_path}" ] && ZINC_INSTALL_FLAG=1
+ install_app \
+ "http://downloads.typesafe.com/zinc/0.3.5.3" \
+ "zinc-0.3.5.3.tgz" \
+ "${zinc_path}"
+ ZINC_BIN="${_DIR}/${zinc_path}"
+}
+
+# Determine the Scala version from the root pom.xml file, set the Scala URL,
+# and, with that, download the specific version of Scala necessary under
+# the build/ folder
+install_scala() {
+ # determine the Scala version used in Spark
+ local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | \
+ head -1 | cut -f2 -d'>' | cut -f1 -d'<'`
+ local scala_bin="${_DIR}/scala-${scala_version}/bin/scala"
+
+ install_app \
+ "http://downloads.typesafe.com/scala/${scala_version}" \
+ "scala-${scala_version}.tgz" \
+ "scala-${scala_version}/bin/scala"
+
+ SCALA_COMPILER="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-compiler.jar"
+ SCALA_LIBRARY="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-library.jar"
+}
+
+# Determines if a given application is already installed. If not, will attempt
+# to install
+## Arg1 - application name
+## Arg2 - Alternate path to local install under build/ dir
+check_and_install_app() {
+ # create the local environment variable in uppercase
+ local app_bin="`echo $1 | awk '{print toupper(\$0)}'`_BIN"
+ # some black magic to set the generated app variable (i.e. MVN_BIN) into the
+ # environment
+ eval "${app_bin}=`which $1 2>/dev/null`"
+
+ if [ -z "`which $1 2>/dev/null`" ]; then
+ install_$1
+ fi
+}
+
+# Setup healthy defaults for the Zinc port if none were provided from
+# the environment
+ZINC_PORT=${ZINC_PORT:-"3030"}
+
+# Check and install all applications necessary to build Spark
+check_and_install_app "mvn"
+
+# Install the proper version of Scala and Zinc for the build
+install_zinc
+install_scala
+
+# Reset the current working directory
+cd "${_CALLING_DIR}"
+
+# Now that zinc is ensured to be installed, check its status and, if it's
+# not running or just installed, start it
+if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then
+ ${ZINC_BIN} -shutdown
+ ${ZINC_BIN} -start -port ${ZINC_PORT} \
+ -scala-compiler "${SCALA_COMPILER}" \
+ -scala-library "${SCALA_LIBRARY}" &>/dev/null
+fi
+
+# Set any `mvn` options if not already present
+export MAVEN_OPTS=${MAVEN_OPTS:-"-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"}
+
+# Last, call the `mvn` command as usual
+${MVN_BIN} "$@"
diff --git a/build/sbt b/build/sbt
new file mode 100755
index 0000000000000..28ebb64f7197c
--- /dev/null
+++ b/build/sbt
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so
+# that we can run Hive to generate the golden answer. This is not required for normal development
+# or testing.
+for i in "$HIVE_HOME"/lib/*
+do HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$i"
+done
+export HADOOP_CLASSPATH
+
+realpath () {
+(
+ TARGET_FILE="$1"
+
+ cd "$(dirname "$TARGET_FILE")"
+ TARGET_FILE="$(basename "$TARGET_FILE")"
+
+ COUNT=0
+ while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
+ do
+ TARGET_FILE="$(readlink "$TARGET_FILE")"
+ cd $(dirname "$TARGET_FILE")
+ TARGET_FILE="$(basename $TARGET_FILE)"
+ COUNT=$(($COUNT + 1))
+ done
+
+ echo "$(pwd -P)/"$TARGET_FILE""
+)
+}
+
+. "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash
+
+
+declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
+declare -r sbt_opts_file=".sbtopts"
+declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
+
+usage() {
+ cat <<EOM
+Usage: $script_name [options]
+
+ -h | -help print this message
+ -v | -verbose this runner is chattier
+ -d | -debug set sbt log level to debug
+ -no-colors disable ANSI color codes
+ -sbt-create start sbt even if current directory contains no sbt project
+ -sbt-dir <path> path to global settings/plugins directory (default: ~/.sbt)
+ -sbt-boot <path> path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
+ -ivy <path> path to local Ivy repository (default: ~/.ivy2)
+ -mem <integer> set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
+ -no-share use all local caches; no sharing
+ -no-global uses global caches, but does not use global ~/.sbt directory.
+ -jvm-debug <port> Turn on JVM debugging, open at the given port.
+ -batch Disable interactive mode
+
+ # sbt version (default: from project/build.properties if present, else latest release)
+ -sbt-version <version> use the specified version of sbt
+ -sbt-jar <path> use the specified jar as the sbt launcher
+ -sbt-rc use an RC version of sbt
+ -sbt-snapshot use a snapshot version of sbt
+
+ # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
+ -java-home <path> alternate JAVA_HOME
+
+ # jvm options and output control
+ JAVA_OPTS environment variable, if unset uses "$java_opts"
+ SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
+ .sbtopts if this file exists in the current directory, it is
+ prepended to the runner args
+ /etc/sbt/sbtopts if this file exists, it is prepended to the runner args
+ -Dkey=val pass -Dkey=val directly to the java runtime
+ -J-X pass option -X directly to the java runtime
+ (-J is stripped)
+ -S-X add -X to sbt's scalacOptions (-S is stripped)
+ -PmavenProfiles Enable a maven profile for the build.
+
+In the case of duplicated or conflicting options, the order above
+shows precedence: JAVA_OPTS lowest, command line options highest.
+EOM
+}
+
+process_my_args () {
+ while [[ $# -gt 0 ]]; do
+ case "$1" in
+ -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
+ -no-share) addJava "$noshare_opts" && shift ;;
+ -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
+ -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
+ -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
+ -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
+ -batch) exec </dev/null && shift ;;
      <artifactId>selenium-java</artifactId>
      <scope>test</scope>
    </dependency>
-    <dependency>
-      <groupId>org.scalatest</groupId>
-      <artifactId>scalatest_${scala.binary.version}</artifactId>
-      <scope>test</scope>
-    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-all</artifactId>
@@ -326,19 +321,6 @@
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-      <plugin>
-        <groupId>org.scalatest</groupId>
-        <artifactId>scalatest-maven-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>test</id>
-            <goals>
-              <goal>test</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
@@ -352,9 +334,9 @@
-
+
-
+
diff --git a/core/src/main/java/com/bealetech/metrics/reporting/Statsd.java b/core/src/main/java/com/bealetech/metrics/reporting/Statsd.java
index 6b79d61184e34..ef92bdd49bc7b 100644
--- a/core/src/main/java/com/bealetech/metrics/reporting/Statsd.java
+++ b/core/src/main/java/com/bealetech/metrics/reporting/Statsd.java
@@ -23,10 +23,10 @@ public class Statsd implements Closeable {
private static final Pattern WHITESPACE = Pattern.compile("[\\s]+");
- private static final String DRIVER = "driver";
+ private static final String DRIVER = "";
private static final String EXECUTOR = "executor";
- private static final String DRIVER_MATCH = ".driver.";
+ private static final String DRIVER_MATCH = "..";
private static final String EXECUTOR_MATCH = ".executor.";
public static enum StatType { COUNTER, TIMER, GAUGE }
diff --git a/core/src/main/java/org/apache/spark/JavaSparkListener.java b/core/src/main/java/org/apache/spark/JavaSparkListener.java
new file mode 100644
index 0000000000000..646496f313507
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/JavaSparkListener.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark;
+
+import org.apache.spark.scheduler.SparkListener;
+import org.apache.spark.scheduler.SparkListenerApplicationEnd;
+import org.apache.spark.scheduler.SparkListenerApplicationStart;
+import org.apache.spark.scheduler.SparkListenerBlockManagerAdded;
+import org.apache.spark.scheduler.SparkListenerBlockManagerRemoved;
+import org.apache.spark.scheduler.SparkListenerEnvironmentUpdate;
+import org.apache.spark.scheduler.SparkListenerExecutorAdded;
+import org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate;
+import org.apache.spark.scheduler.SparkListenerExecutorRemoved;
+import org.apache.spark.scheduler.SparkListenerJobEnd;
+import org.apache.spark.scheduler.SparkListenerJobStart;
+import org.apache.spark.scheduler.SparkListenerStageCompleted;
+import org.apache.spark.scheduler.SparkListenerStageSubmitted;
+import org.apache.spark.scheduler.SparkListenerTaskEnd;
+import org.apache.spark.scheduler.SparkListenerTaskGettingResult;
+import org.apache.spark.scheduler.SparkListenerTaskStart;
+import org.apache.spark.scheduler.SparkListenerUnpersistRDD;
+
+/**
+ * Java clients should extend this class instead of implementing
+ * SparkListener directly. This is to prevent java clients
+ * from breaking when new events are added to the SparkListener
+ * trait.
+ *
+ * This is a concrete class instead of abstract to enforce
+ * new events get added to both the SparkListener and this adapter
+ * in lockstep.
+ */
+public class JavaSparkListener implements SparkListener {
+
+ @Override
+ public void onStageCompleted(SparkListenerStageCompleted stageCompleted) { }
+
+ @Override
+ public void onStageSubmitted(SparkListenerStageSubmitted stageSubmitted) { }
+
+ @Override
+ public void onTaskStart(SparkListenerTaskStart taskStart) { }
+
+ @Override
+ public void onTaskGettingResult(SparkListenerTaskGettingResult taskGettingResult) { }
+
+ @Override
+ public void onTaskEnd(SparkListenerTaskEnd taskEnd) { }
+
+ @Override
+ public void onJobStart(SparkListenerJobStart jobStart) { }
+
+ @Override
+ public void onJobEnd(SparkListenerJobEnd jobEnd) { }
+
+ @Override
+ public void onEnvironmentUpdate(SparkListenerEnvironmentUpdate environmentUpdate) { }
+
+ @Override
+ public void onBlockManagerAdded(SparkListenerBlockManagerAdded blockManagerAdded) { }
+
+ @Override
+ public void onBlockManagerRemoved(SparkListenerBlockManagerRemoved blockManagerRemoved) { }
+
+ @Override
+ public void onUnpersistRDD(SparkListenerUnpersistRDD unpersistRDD) { }
+
+ @Override
+ public void onApplicationStart(SparkListenerApplicationStart applicationStart) { }
+
+ @Override
+ public void onApplicationEnd(SparkListenerApplicationEnd applicationEnd) { }
+
+ @Override
+ public void onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate executorMetricsUpdate) { }
+
+ @Override
+ public void onExecutorAdded(SparkListenerExecutorAdded executorAdded) { }
+
+ @Override
+ public void onExecutorRemoved(SparkListenerExecutorRemoved executorRemoved) { }
+}
diff --git a/core/src/main/java/org/apache/spark/TaskContext.java b/core/src/main/java/org/apache/spark/TaskContext.java
index 0d6973203eba1..095f9fb94fdf0 100644
--- a/core/src/main/java/org/apache/spark/TaskContext.java
+++ b/core/src/main/java/org/apache/spark/TaskContext.java
@@ -62,7 +62,7 @@ static void unset() {
*/
public abstract boolean isInterrupted();
- /** @deprecated: use isRunningLocally() */
+ /** @deprecated use {@link #isRunningLocally()} */
@Deprecated
public abstract boolean runningLocally();
@@ -87,19 +87,39 @@ static void unset() {
* is for HadoopRDD to register a callback to close the input stream.
* Will be called in any situation - success, failure, or cancellation.
*
- * @deprecated: use addTaskCompletionListener
+ * @deprecated use {@link #addTaskCompletionListener(scala.Function1)}
*
* @param f Callback function.
*/
@Deprecated
public abstract void addOnCompleteCallback(final Function0 f);
+ /**
+ * The ID of the stage that this task belongs to.
+ */
public abstract int stageId();
+ /**
+ * The ID of the RDD partition that is computed by this task.
+ */
public abstract int partitionId();
+ /**
+ * How many times this task has been attempted. The first task attempt will be assigned
+ * attemptNumber = 0, and subsequent attempts will have increasing attempt numbers.
+ */
+ public abstract int attemptNumber();
+
+ /** @deprecated use {@link #taskAttemptId()}; it was renamed to avoid ambiguity. */
+ @Deprecated
public abstract long attemptId();
+ /**
+ * An ID that is unique to this task attempt (within the same SparkContext, no two task attempts
+ * will share the same attempt ID). This is roughly equivalent to Hadoop's TaskAttemptID.
+ */
+ public abstract long taskAttemptId();
+
/** ::DeveloperApi:: */
@DeveloperApi
public abstract TaskMetrics taskMetrics();
diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties
index 89eec7d4b7f61..c99a61f63ea2b 100644
--- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties
+++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties
@@ -10,3 +10,4 @@ log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.apache.hadoop.yarn.util.RackResolver=WARN
diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css
index 68c52ac09b55a..a1f7133f897ee 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/webui.css
+++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css
@@ -19,6 +19,7 @@
height: 50px;
font-size: 15px;
margin-bottom: 15px;
+ min-width: 1200px
}
.navbar .navbar-inner {
@@ -39,12 +40,12 @@
.navbar .nav > li a {
height: 30px;
- line-height: 30px;
+ line-height: 2;
}
.navbar-text {
height: 50px;
- line-height: 50px;
+ line-height: 3.3;
}
table.sortable thead {
@@ -120,6 +121,14 @@ pre {
border: none;
}
+.description-input {
+ overflow: hidden;
+ text-overflow: ellipsis;
+ width: 100%;
+ white-space: nowrap;
+ display: block;
+}
+
.stacktrace-details {
max-height: 300px;
overflow-y: auto;
@@ -169,6 +178,16 @@ span.additional-metric-title {
display: inline-block;
}
+.version {
+ line-height: 2.5;
+ vertical-align: bottom;
+ font-size: 12px;
+ padding: 0;
+ margin: 0;
+ font-weight: bold;
+ color: #777;
+}
+
/* Hide all additional metrics by default. This is done here rather than using JavaScript to
* avoid slow page loads for stage pages with large numbers (e.g., thousands) of tasks. */
.scheduler_delay, .deserialization_time, .serialization_time, .getting_result_time {
diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala
index 000bbd6b532ad..5f31bfba3f8d6 100644
--- a/core/src/main/scala/org/apache/spark/Accumulators.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulators.scala
@@ -19,6 +19,7 @@ package org.apache.spark
import java.io.{ObjectInputStream, Serializable}
import java.util.concurrent.atomic.AtomicLong
+import java.lang.ThreadLocal
import scala.collection.generic.Growable
import scala.collection.mutable.Map
@@ -278,10 +279,12 @@ object AccumulatorParam {
// TODO: The multi-thread support in accumulators is kind of lame; check
// if there's a more intuitive way of doing it right
-private object Accumulators {
+private[spark] object Accumulators {
// TODO: Use soft references? => need to make readObject work properly then
val originals = Map[Long, Accumulable[_, _]]()
- val localAccums = Map[Thread, Map[Long, Accumulable[_, _]]]()
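+ // Keep per-thread accumulators in a ThreadLocal so each task thread sees only its own registrations.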
+ val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() {
+ override protected def initialValue() = Map[Long, Accumulable[_, _]]()
+ }
var lastId: Long = 0
def newId(): Long = synchronized {
@@ -293,22 +296,21 @@ private object Accumulators {
if (original) {
originals(a.id) = a
} else {
- val accums = localAccums.getOrElseUpdate(Thread.currentThread, Map())
- accums(a.id) = a
+ localAccums.get()(a.id) = a
}
}
// Clear the local (non-original) accumulators for the current thread
def clear() {
synchronized {
- localAccums.remove(Thread.currentThread)
+ localAccums.get.clear
}
}
// Get the values of the local accumulators for the current thread (by ID)
def values: Map[Long, Any] = synchronized {
val ret = Map[Long, Any]()
- for ((id, accum) <- localAccums.getOrElse(Thread.currentThread, Map())) {
+ for ((id, accum) <- localAccums.get) {
ret(id) = accum.localValue
}
return ret
diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala
index 79c9c451d273d..3b684bbeceaf2 100644
--- a/core/src/main/scala/org/apache/spark/Aggregator.scala
+++ b/core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -34,7 +34,9 @@ case class Aggregator[K, V, C] (
mergeValue: (C, V) => C,
mergeCombiners: (C, C) => C) {
- private val externalSorting = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)
+ // When spilling is enabled sorting will happen externally, but not necessarily with an
+ // ExternalSorter.
+ private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)
@deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
@@ -42,7 +44,7 @@ case class Aggregator[K, V, C] (
def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
context: TaskContext): Iterator[(K, C)] = {
- if (!externalSorting) {
+ if (!isSpillEnabled) {
val combiners = new AppendOnlyMap[K,C]
var kv: Product2[K, V] = null
val update = (hadValue: Boolean, oldValue: C) => {
@@ -59,8 +61,8 @@ case class Aggregator[K, V, C] (
// Update task metrics if context is not null
// TODO: Make context non optional in a future release
Option(context).foreach { c =>
- c.taskMetrics.memoryBytesSpilled += combiners.memoryBytesSpilled
- c.taskMetrics.diskBytesSpilled += combiners.diskBytesSpilled
+ c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
+ c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
}
combiners.iterator
}
@@ -71,9 +73,9 @@ case class Aggregator[K, V, C] (
combineCombinersByKey(iter, null)
def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
- : Iterator[(K, C)] =
+ : Iterator[(K, C)] =
{
- if (!externalSorting) {
+ if (!isSpillEnabled) {
val combiners = new AppendOnlyMap[K,C]
var kc: Product2[K, C] = null
val update = (hadValue: Boolean, oldValue: C) => {
@@ -93,8 +95,8 @@ case class Aggregator[K, V, C] (
// Update task metrics if context is not null
// TODO: Make context non-optional in a future release
Option(context).foreach { c =>
- c.taskMetrics.memoryBytesSpilled += combiners.memoryBytesSpilled
- c.taskMetrics.diskBytesSpilled += combiners.diskBytesSpilled
+ c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
+ c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
}
combiners.iterator
}
diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala
index 80da62c44edc5..a0c0372b7f0ef 100644
--- a/core/src/main/scala/org/apache/spark/CacheManager.scala
+++ b/core/src/main/scala/org/apache/spark/CacheManager.scala
@@ -44,7 +44,11 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
blockManager.get(key) match {
case Some(blockResult) =>
// Partition is already materialized, so just return its values
- context.taskMetrics.inputMetrics = Some(blockResult.inputMetrics)
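+ // Add the cached block's bytes read to any metrics already recorded for this read method instead of replacing them.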
+ val inputMetrics = blockResult.inputMetrics
+ val existingMetrics = context.taskMetrics
+ .getInputMetricsForReadMethod(inputMetrics.readMethod)
+ existingMetrics.addBytesRead(inputMetrics.bytesRead)
+
new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]])
case None =>
diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala
index ab2594cfc02eb..9a7cd4523e5ab 100644
--- a/core/src/main/scala/org/apache/spark/Dependency.scala
+++ b/core/src/main/scala/org/apache/spark/Dependency.scala
@@ -60,6 +60,9 @@ abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
* @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If set to None,
* the default serializer, as specified by `spark.serializer` config option, will
* be used.
+ * @param keyOrdering key ordering for RDD's shuffles
+ * @param aggregator map/reduce-side aggregator for RDD's shuffle
+ * @param mapSideCombine whether to perform partial aggregation (also known as map-side combine)
*/
@DeveloperApi
class ShuffleDependency[K, V, C](
diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala
new file mode 100644
index 0000000000000..a46a81eabd965
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+/**
+ * A client that communicates with the cluster manager to request or kill executors.
+ */
+private[spark] trait ExecutorAllocationClient {
+
+ /**
+ * Request an additional number of executors from the cluster manager.
+ * Return whether the request is acknowledged by the cluster manager.
+ */
+ def requestExecutors(numAdditionalExecutors: Int): Boolean
+
+ /**
+ * Request that the cluster manager kill the specified executors.
+ * Return whether the request is acknowledged by the cluster manager.
+ */
+ def killExecutors(executorIds: Seq[String]): Boolean
+
+ /**
+ * Request that the cluster manager kill the specified executor.
+ * Return whether the request is acknowledged by the cluster manager.
+ */
+ def killExecutor(executorId: String): Boolean = killExecutors(Seq(executorId))
+}
diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 88adb892998af..b28da192c1c0d 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -60,10 +60,15 @@ import org.apache.spark.scheduler._
* spark.dynamicAllocation.executorIdleTimeout (K) -
* If an executor has been idle for this duration, remove it
*/
-private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging {
- import ExecutorAllocationManager._
+private[spark] class ExecutorAllocationManager(
+ client: ExecutorAllocationClient,
+ listenerBus: LiveListenerBus,
+ conf: SparkConf)
+ extends Logging {
+
+ allocationManager =>
- private val conf = sc.conf
+ import ExecutorAllocationManager._
// Lower and upper bounds on the number of executors. These are required.
private val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", -1)
@@ -119,7 +124,7 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
private var clock: Clock = new RealClock
// Listener for Spark events that impact the allocation policy
- private val listener = new ExecutorAllocationListener(this)
+ private val listener = new ExecutorAllocationListener
/**
* Verify that the settings specified through the config are valid.
@@ -153,7 +158,7 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
"shuffle service. You may enable this through spark.shuffle.service.enabled.")
}
if (tasksPerExecutor == 0) {
- throw new SparkException("spark.executor.cores must not be less than spark.task.cpus.cores")
+ throw new SparkException("spark.executor.cores must not be less than spark.task.cpus.")
}
}
@@ -168,7 +173,7 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
* Register for scheduler callbacks to decide when to add and remove executors.
*/
def start(): Unit = {
- sc.addSparkListener(listener)
+ listenerBus.addListener(listener)
startPolling()
}
@@ -207,11 +212,12 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
addTime += sustainedSchedulerBacklogTimeout * 1000
}
- removeTimes.foreach { case (executorId, expireTime) =>
- if (now >= expireTime) {
+ removeTimes.retain { case (executorId, expireTime) =>
+ val expired = now >= expireTime
+ if (expired) {
removeExecutor(executorId)
- removeTimes.remove(executorId)
}
+ !expired
}
}
@@ -253,7 +259,7 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
val actualNumExecutorsToAdd = math.min(numExecutorsToAdd, maxNumExecutorsToAdd)
val newTotalExecutors = numExistingExecutors + actualNumExecutorsToAdd
- val addRequestAcknowledged = testing || sc.requestExecutors(actualNumExecutorsToAdd)
+ val addRequestAcknowledged = testing || client.requestExecutors(actualNumExecutorsToAdd)
if (addRequestAcknowledged) {
logInfo(s"Requesting $actualNumExecutorsToAdd new executor(s) because " +
s"tasks are backlogged (new desired total will be $newTotalExecutors)")
@@ -289,13 +295,13 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
// Do not kill the executor if we have already reached the lower bound
val numExistingExecutors = executorIds.size - executorsPendingToRemove.size
if (numExistingExecutors - 1 < minNumExecutors) {
- logInfo(s"Not removing idle executor $executorId because there are only " +
+ logDebug(s"Not removing idle executor $executorId because there are only " +
s"$numExistingExecutors executor(s) left (limit $minNumExecutors)")
return false
}
// Send a request to the backend to kill this executor
- val removeRequestAcknowledged = testing || sc.killExecutor(executorId)
+ val removeRequestAcknowledged = testing || client.killExecutor(executorId)
if (removeRequestAcknowledged) {
logInfo(s"Removing executor $executorId because it has been idle for " +
s"$executorIdleTimeout seconds (new desired total will be ${numExistingExecutors - 1})")
@@ -313,7 +319,11 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
private def onExecutorAdded(executorId: String): Unit = synchronized {
if (!executorIds.contains(executorId)) {
executorIds.add(executorId)
- executorIds.foreach(onExecutorIdle)
+ // If an executor (call this executor X) is not removed because the lower bound
+ // has been reached, it will no longer be marked as idle. When new executors join,
+ // however, we are no longer at the lower bound, and so we must mark executor X
+ // as idle again so as not to forget that it is a candidate for removal. (see SPARK-4951)
+ executorIds.filter(listener.isExecutorIdle).foreach(onExecutorIdle)
logInfo(s"New executor $executorId has registered (new total is ${executorIds.size})")
if (numExecutorsPending > 0) {
numExecutorsPending -= 1
@@ -371,10 +381,14 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
* the executor is not already marked as idle.
*/
private def onExecutorIdle(executorId: String): Unit = synchronized {
- if (!removeTimes.contains(executorId) && !executorsPendingToRemove.contains(executorId)) {
- logDebug(s"Starting idle timer for $executorId because there are no more tasks " +
- s"scheduled to run on the executor (to expire in $executorIdleTimeout seconds)")
- removeTimes(executorId) = clock.getTimeMillis + executorIdleTimeout * 1000
+ if (executorIds.contains(executorId)) {
+ if (!removeTimes.contains(executorId) && !executorsPendingToRemove.contains(executorId)) {
+ logDebug(s"Starting idle timer for $executorId because there are no more tasks " +
+ s"scheduled to run on the executor (to expire in $executorIdleTimeout seconds)")
+ removeTimes(executorId) = clock.getTimeMillis + executorIdleTimeout * 1000
+ }
+ } else {
+ logWarning(s"Attempted to mark unknown executor $executorId idle")
}
}
@@ -394,25 +408,24 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
* and consistency of events returned by the listener. For simplicity, it does not account
* for speculated tasks.
*/
- private class ExecutorAllocationListener(allocationManager: ExecutorAllocationManager)
- extends SparkListener {
+ private class ExecutorAllocationListener extends SparkListener {
private val stageIdToNumTasks = new mutable.HashMap[Int, Int]
private val stageIdToTaskIndices = new mutable.HashMap[Int, mutable.HashSet[Int]]
private val executorIdToTaskIds = new mutable.HashMap[String, mutable.HashSet[Long]]
override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
- synchronized {
- val stageId = stageSubmitted.stageInfo.stageId
- val numTasks = stageSubmitted.stageInfo.numTasks
+ val stageId = stageSubmitted.stageInfo.stageId
+ val numTasks = stageSubmitted.stageInfo.numTasks
+ allocationManager.synchronized {
stageIdToNumTasks(stageId) = numTasks
allocationManager.onSchedulerBacklogged()
}
}
override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
- synchronized {
- val stageId = stageCompleted.stageInfo.stageId
+ val stageId = stageCompleted.stageInfo.stageId
+ allocationManager.synchronized {
stageIdToNumTasks -= stageId
stageIdToTaskIndices -= stageId
@@ -424,39 +437,49 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
}
}
- override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized {
+ override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
val stageId = taskStart.stageId
val taskId = taskStart.taskInfo.taskId
val taskIndex = taskStart.taskInfo.index
val executorId = taskStart.taskInfo.executorId
- // If this is the last pending task, mark the scheduler queue as empty
- stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex
- val numTasksScheduled = stageIdToTaskIndices(stageId).size
- val numTasksTotal = stageIdToNumTasks.getOrElse(stageId, -1)
- if (numTasksScheduled == numTasksTotal) {
- // No more pending tasks for this stage
- stageIdToNumTasks -= stageId
- if (stageIdToNumTasks.isEmpty) {
- allocationManager.onSchedulerQueueEmpty()
+ allocationManager.synchronized {
+ // This guards against the race condition in which the `SparkListenerTaskStart`
+ // event is posted before the `SparkListenerBlockManagerAdded` event, which is
+ // possible because these events are posted in different threads. (see SPARK-4951)
+ if (!allocationManager.executorIds.contains(executorId)) {
+ allocationManager.onExecutorAdded(executorId)
}
- }
- // Mark the executor on which this task is scheduled as busy
- executorIdToTaskIds.getOrElseUpdate(executorId, new mutable.HashSet[Long]) += taskId
- allocationManager.onExecutorBusy(executorId)
+ // If this is the last pending task, mark the scheduler queue as empty
+ stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex
+ val numTasksScheduled = stageIdToTaskIndices(stageId).size
+ val numTasksTotal = stageIdToNumTasks.getOrElse(stageId, -1)
+ if (numTasksScheduled == numTasksTotal) {
+ // No more pending tasks for this stage
+ stageIdToNumTasks -= stageId
+ if (stageIdToNumTasks.isEmpty) {
+ allocationManager.onSchedulerQueueEmpty()
+ }
+ }
+
+ // Mark the executor on which this task is scheduled as busy
+ executorIdToTaskIds.getOrElseUpdate(executorId, new mutable.HashSet[Long]) += taskId
+ allocationManager.onExecutorBusy(executorId)
+ }
}
- override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
+ override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
val executorId = taskEnd.taskInfo.executorId
val taskId = taskEnd.taskInfo.taskId
-
- // If the executor is no longer running scheduled any tasks, mark it as idle
- if (executorIdToTaskIds.contains(executorId)) {
- executorIdToTaskIds(executorId) -= taskId
- if (executorIdToTaskIds(executorId).isEmpty) {
- executorIdToTaskIds -= executorId
- allocationManager.onExecutorIdle(executorId)
+ allocationManager.synchronized {
+ // If the executor is no longer running any scheduled tasks, mark it as idle
+ if (executorIdToTaskIds.contains(executorId)) {
+ executorIdToTaskIds(executorId) -= taskId
+ if (executorIdToTaskIds(executorId).isEmpty) {
+ executorIdToTaskIds -= executorId
+ allocationManager.onExecutorIdle(executorId)
+ }
}
}
}
@@ -464,7 +487,12 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = {
val executorId = blockManagerAdded.blockManagerId.executorId
if (executorId != SparkContext.DRIVER_IDENTIFIER) {
- allocationManager.onExecutorAdded(executorId)
+ // This guards against the race condition in which the `SparkListenerTaskStart`
+ // event is posted before the `SparkListenerBlockManagerAdded` event, which is
+ // possible because these events are posted in different threads. (see SPARK-4951)
+ if (!allocationManager.executorIds.contains(executorId)) {
+ allocationManager.onExecutorAdded(executorId)
+ }
}
}
@@ -476,12 +504,23 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
/**
* An estimate of the total number of pending tasks remaining for currently running stages. Does
* not account for tasks which may have failed and been resubmitted.
+ *
+ * Note: This is not thread-safe without the caller owning the `allocationManager` lock.
*/
def totalPendingTasks(): Int = {
stageIdToNumTasks.map { case (stageId, numTasks) =>
numTasks - stageIdToTaskIndices.get(stageId).map(_.size).getOrElse(0)
}.sum
}
+
+ /**
+ * Return true if an executor is not currently running a task, and false otherwise.
+ *
+ * Note: This is not thread-safe without the caller owning the `allocationManager` lock.
+ */
+ def isExecutorIdle(executorId: String): Boolean = {
+ !executorIdToTaskIds.contains(executorId)
+ }
}
}
diff --git a/core/src/main/scala/org/apache/spark/HttpFileServer.scala b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
index edc3889c9ae51..677c5e0f89d72 100644
--- a/core/src/main/scala/org/apache/spark/HttpFileServer.scala
+++ b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
@@ -24,6 +24,7 @@ import com.google.common.io.Files
import org.apache.spark.util.Utils
private[spark] class HttpFileServer(
+ conf: SparkConf,
securityManager: SecurityManager,
requestedPort: Int = 0)
extends Logging {
@@ -41,7 +42,7 @@ private[spark] class HttpFileServer(
fileDir.mkdir()
jarDir.mkdir()
logInfo("HTTP File server directory is " + baseDir)
- httpServer = new HttpServer(baseDir, securityManager, requestedPort, "HTTP file server")
+ httpServer = new HttpServer(conf, baseDir, securityManager, requestedPort, "HTTP file server")
httpServer.start()
serverUri = httpServer.uri
logDebug("HTTP file server started at: " + serverUri)
diff --git a/core/src/main/scala/org/apache/spark/HttpServer.scala b/core/src/main/scala/org/apache/spark/HttpServer.scala
index 912558d0cab7d..fa22787ce7ea3 100644
--- a/core/src/main/scala/org/apache/spark/HttpServer.scala
+++ b/core/src/main/scala/org/apache/spark/HttpServer.scala
@@ -42,6 +42,7 @@ private[spark] class ServerStateException(message: String) extends Exception(mes
* around a Jetty server.
*/
private[spark] class HttpServer(
+ conf: SparkConf,
resourceBase: File,
securityManager: SecurityManager,
requestedPort: Int = 0,
@@ -57,7 +58,7 @@ private[spark] class HttpServer(
} else {
logInfo("Starting HTTP Server")
val (actualServer, actualPort) =
- Utils.startServiceOnPort[Server](requestedPort, doStart, serverName)
+ Utils.startServiceOnPort[Server](requestedPort, doStart, conf, serverName)
server = actualServer
port = actualPort
}
diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
index 7d96962c4acd7..6e4edc7c80d7a 100644
--- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
+++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -72,20 +72,22 @@ private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster
/**
* Class that keeps track of the location of the map output of
* a stage. This is abstract because different versions of MapOutputTracker
- * (driver and worker) use different HashMap to store its metadata.
+ * (driver and executor) use different HashMap to store its metadata.
*/
private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging {
private val timeout = AkkaUtils.askTimeout(conf)
+ private val retryAttempts = AkkaUtils.numRetries(conf)
+ private val retryIntervalMs = AkkaUtils.retryWaitMs(conf)
/** Set to the MapOutputTrackerActor living on the driver. */
var trackerActor: ActorRef = _
/**
- * This HashMap has different behavior for the master and the workers.
+ * This HashMap has different behavior for the driver and the executors.
*
- * On the master, it serves as the source of map outputs recorded from ShuffleMapTasks.
- * On the workers, it simply serves as a cache, in which a miss triggers a fetch from the
- * master's corresponding HashMap.
+ * On the driver, it serves as the source of map outputs recorded from ShuffleMapTasks.
+ * On the executors, it simply serves as a cache, in which a miss triggers a fetch from the
+ * driver's corresponding HashMap.
*
* Note: because mapStatuses is accessed concurrently, subclasses should make sure it's a
* thread-safe map.
@@ -99,7 +101,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
protected var epoch: Long = 0
protected val epochLock = new AnyRef
- /** Remembers which map output locations are currently being fetched on a worker. */
+ /** Remembers which map output locations are currently being fetched on an executor. */
private val fetching = new HashSet[Int]
/**
@@ -108,8 +110,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
*/
protected def askTracker(message: Any): Any = {
try {
- val future = trackerActor.ask(message)(timeout)
- Await.result(future, timeout)
+ AkkaUtils.askWithReply(message, trackerActor, retryAttempts, retryIntervalMs, timeout)
} catch {
case e: Exception =>
logError("Error communicating with MapOutputTracker", e)
@@ -136,14 +137,12 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them")
var fetchedStatuses: Array[MapStatus] = null
fetching.synchronized {
- if (fetching.contains(shuffleId)) {
- // Someone else is fetching it; wait for them to be done
- while (fetching.contains(shuffleId)) {
- try {
- fetching.wait()
- } catch {
- case e: InterruptedException =>
- }
+ // Someone else is fetching it; wait for them to be done
+ while (fetching.contains(shuffleId)) {
+ try {
+ fetching.wait()
+ } catch {
+ case e: InterruptedException =>
}
}
@@ -198,8 +197,8 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
/**
* Called from executors to update the epoch number, potentially clearing old outputs
- * because of a fetch failure. Each worker task calls this with the latest epoch
- * number on the master at the time it was created.
+ * because of a fetch failure. Each executor task calls this with the latest epoch
+ * number on the driver at the time it was created.
*/
def updateEpoch(newEpoch: Long) {
epochLock.synchronized {
@@ -231,7 +230,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
private var cacheEpoch = epoch
/**
- * Timestamp based HashMap for storing mapStatuses and cached serialized statuses in the master,
+ * Timestamp based HashMap for storing mapStatuses and cached serialized statuses in the driver,
* so that statuses are dropped only by explicit de-registering or by TTL-based cleaning (if set).
* Other than these two scenarios, nothing should be dropped from this HashMap.
*/
@@ -341,7 +340,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
}
/**
- * MapOutputTracker for the workers, which fetches map output information from the driver's
+ * MapOutputTracker for the executors, which fetches map output information from the driver's
* MapOutputTrackerMaster.
*/
private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) {
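
askTracker now goes through AkkaUtils.askWithReply, which retries the ask a configurable number of times instead of failing on the first timeout. The sketch below shows only the shape of that retry loop under stated assumptions: send stands in for the actor ask, and the names and parameters are illustrative rather than the real AkkaUtils signature.

import scala.concurrent.{Await, Future}
import scala.concurrent.duration._

object AskWithRetrySketch {
  /** Send a message up to maxAttempts times, sleeping retryInterval between failed attempts. */
  def askWithReply[T](message: Any, send: Any => Future[T], maxAttempts: Int,
      retryInterval: FiniteDuration, timeout: FiniteDuration): T = {
    var attempts = 0
    var lastException: Exception = null
    while (attempts < maxAttempts) {
      attempts += 1
      try {
        // In Spark this would be trackerActor.ask(message)(timeout).
        return Await.result(send(message), timeout)
      } catch {
        case e: Exception =>
          lastException = e
          Thread.sleep(retryInterval.toMillis)
      }
    }
    throw new RuntimeException(s"Error sending message after $maxAttempts attempts", lastException)
  }

  def main(args: Array[String]): Unit = {
    val reply = askWithReply[String]("GetMapOutputStatuses", _ => Future.successful("statuses"),
      maxAttempts = 3, retryInterval = 100.millis, timeout = 5.seconds)
    println(reply)   // prints "statuses"
  }
}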
diff --git a/core/src/main/scala/org/apache/spark/Partition.scala b/core/src/main/scala/org/apache/spark/Partition.scala
index 27892dbd2a0bc..dd3f28e4197e3 100644
--- a/core/src/main/scala/org/apache/spark/Partition.scala
+++ b/core/src/main/scala/org/apache/spark/Partition.scala
@@ -18,11 +18,11 @@
package org.apache.spark
/**
- * A partition of an RDD.
+ * An identifier for a partition in an RDD.
*/
trait Partition extends Serializable {
/**
- * Get the split's index within its parent RDD
+ * Get the partition's index within its parent RDD
*/
def index: Int
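
The reworded doc comment stresses that a Partition is just a serializable identifier, not the data it points to. A small illustrative sketch, with a hypothetical RangePartition standing in for what a custom RDD might define:

object PartitionSketch {
  // Shape of the trait documented above: a partition carries only its index within the
  // parent RDD (plus whatever bookkeeping the RDD needs), never the data itself.
  trait Partition extends Serializable {
    def index: Int
  }

  // A custom RDD would typically define one such identifier per split of its input.
  final case class RangePartition(index: Int, start: Long, end: Long) extends Partition

  def main(args: Array[String]): Unit = {
    val parts = (0 until 4).map(i => RangePartition(i, i * 100L, (i + 1) * 100L))
    parts.foreach(p => println(s"partition ${p.index}: [${p.start}, ${p.end})"))
  }
}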
diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala
index dbff9d12b5ad7..ec82d09cd079b 100644
--- a/core/src/main/scala/org/apache/spark/SecurityManager.scala
+++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala
@@ -93,19 +93,19 @@ import org.apache.spark.network.sasl.SecretKeyHolder
* Note that SASL is pluggable as to what mechanism it uses. We currently use
* DIGEST-MD5 but this could be changed to use Kerberos or other in the future.
* Spark currently supports "auth" for the quality of protection, which means
- * the connection is not supporting integrity or privacy protection (encryption)
+ * the connection does not support integrity or privacy protection (encryption)
* after authentication. SASL also supports "auth-int" and "auth-conf" which
- * SPARK could be support in the future to allow the user to specify the quality
+ * SPARK could support in the future to allow the user to specify the quality
* of protection they want. If we support those, the messages will also have to
* be wrapped and unwrapped via the SaslServer/SaslClient.wrap/unwrap API's.
*
* Since the NioBlockTransferService does asynchronous messages passing, the SASL
* authentication is a bit more complex. A ConnectionManager can be both a client
- * and a Server, so for a particular connection is has to determine what to do.
+ * and a Server, so for a particular connection it has to determine what to do.
* A ConnectionId was added to be able to track connections and is used to
* match up incoming messages with connections waiting for authentication.
- * The ConnectionManager tracks all the sendingConnections using the ConnectionId
- * and waits for the response from the server and does the handshake before sending
+ * The ConnectionManager tracks all the sendingConnections using the ConnectionId,
+ * waits for the response from the server, and does the handshake before sending
* the real message.
*
* The NettyBlockTransferService ensures that SASL authentication is performed
@@ -114,14 +114,14 @@ import org.apache.spark.network.sasl.SecretKeyHolder
*
* - HTTP for the Spark UI -> the UI was changed to use servlets so that javax servlet filters
* can be used. Yarn requires a specific AmIpFilter be installed for security to work
- * properly. For non-Yarn deployments, users can write a filter to go through a
- * companies normal login service. If an authentication filter is in place then the
+ * properly. For non-Yarn deployments, users can write a filter to go through their
+ * organization's normal login service. If an authentication filter is in place then the
* SparkUI can be configured to check the logged in user against the list of users who
* have view acls to see if that user is authorized.
* The filters can also be used for many different purposes. For instance filters
* could be used for logging, encryption, or compression.
*
- * The exact mechanisms used to generate/distributed the shared secret is deployment specific.
+ * The exact mechanisms used to generate/distribute the shared secret are deployment-specific.
*
* For Yarn deployments, the secret is automatically generated using the Akka remote
* Crypt.generateSecureCookie() API. The secret is placed in the Hadoop UGI which gets passed
@@ -138,7 +138,7 @@ import org.apache.spark.network.sasl.SecretKeyHolder
* All the nodes (Master and Workers) and the applications need to have the same shared secret.
 * This again is not ideal as one user could potentially affect another user's application.
* This should be enhanced in the future to provide better protection.
- * If the UI needs to be secured the user needs to install a javax servlet filter to do the
+ * If the UI needs to be secure, the user needs to install a javax servlet filter to do the
* authentication. Spark will then use that user to compare against the view acls to do
 * authorization. If no filter is in place the user is generally null and no authorization
* can take place.
@@ -151,8 +151,8 @@ private[spark] class SecurityManager(sparkConf: SparkConf) extends Logging with
private val authOn = sparkConf.getBoolean("spark.authenticate", false)
// keep spark.ui.acls.enable for backwards compatibility with 1.0
- private var aclsOn = sparkConf.getOption("spark.acls.enable").getOrElse(
- sparkConf.get("spark.ui.acls.enable", "false")).toBoolean
+ private var aclsOn =
+ sparkConf.getBoolean("spark.acls.enable", sparkConf.getBoolean("spark.ui.acls.enable", false))
// admin acls should be set before view or modify acls
private var adminAcls: Set[String] =
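
The aclsOn change reads the new spark.acls.enable key and only falls back to the legacy spark.ui.acls.enable key when the new one is absent. A tiny sketch of that fallback order, using a plain Map in place of SparkConf:

object AclFallbackSketch {
  // Stand-in for SparkConf.getBoolean: look up a key, else use the supplied default.
  def getBoolean(conf: Map[String, String], key: String, default: Boolean): Boolean =
    conf.get(key).map(_.toBoolean).getOrElse(default)

  def main(args: Array[String]): Unit = {
    val legacyOnly = Map("spark.ui.acls.enable" -> "true")
    val newKeyWins = Map("spark.acls.enable" -> "false", "spark.ui.acls.enable" -> "true")

    // New key first, legacy key as its default, false as the final fallback.
    def aclsOn(conf: Map[String, String]): Boolean =
      getBoolean(conf, "spark.acls.enable", getBoolean(conf, "spark.ui.acls.enable", false))

    println(aclsOn(legacyOnly))   // true: legacy key still honored
    println(aclsOn(newKeyWins))   // false: new key overrides the legacy one
  }
}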
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index c14764f773982..f9d4aa4240e9d 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -18,6 +18,7 @@
package org.apache.spark
import scala.collection.JavaConverters._
+import scala.collection.concurrent.TrieMap
import scala.collection.mutable.{HashMap, LinkedHashSet}
import org.apache.spark.serializer.KryoSerializer
@@ -46,7 +47,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
/** Create a SparkConf that loads defaults from system properties and the classpath */
def this() = this(true)
- private[spark] val settings = new HashMap[String, String]()
+ private[spark] val settings = new TrieMap[String, String]()
if (loadDefaults) {
// Load any spark.* system properties
@@ -177,7 +178,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
}
/** Get all parameters as a list of pairs */
- def getAll: Array[(String, String)] = settings.clone().toArray
+ def getAll: Array[(String, String)] = settings.toArray
/** Get a parameter as an integer, falling back to a default if not set */
def getInt(key: String, defaultValue: Int): Int = {
@@ -370,7 +371,9 @@ private[spark] object SparkConf {
}
/**
- * Return whether the given config is a Spark port config.
+ * Return true if the given config matches either `spark.*.port` or `spark.port.*`.
*/
- def isSparkPortConf(name: String): Boolean = name.startsWith("spark.") && name.endsWith(".port")
+ def isSparkPortConf(name: String): Boolean = {
+ (name.startsWith("spark.") && name.endsWith(".port")) || name.startsWith("spark.port.")
+ }
}
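
Two things change in SparkConf: settings becomes a lock-free TrieMap, so getAll no longer needs to clone before converting to an array, and isSparkPortConf now recognizes both spark.*.port and spark.port.* keys. The predicate can be exercised on its own; the snippet below mirrors it and shows which keys match:

object PortConfSketch {
  // Mirrors the predicate added above: spark.*.port or spark.port.*
  def isSparkPortConf(name: String): Boolean =
    (name.startsWith("spark.") && name.endsWith(".port")) || name.startsWith("spark.port.")

  def main(args: Array[String]): Unit = {
    Seq("spark.driver.port", "spark.port.maxRetries", "spark.executor.memory")
      .foreach(k => println(s"$k -> ${isSparkPortConf(k)}"))
    // spark.driver.port -> true, spark.port.maxRetries -> true, spark.executor.memory -> false
  }
}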
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index aded7c12e274e..4c4ee04cc515e 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -64,7 +64,7 @@ import org.apache.spark.util._
* @param config a Spark Config object describing the application configuration. Any settings in
* this config overrides the default configs as well as system properties.
*/
-class SparkContext(config: SparkConf) extends Logging {
+class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient {
// The call site where this SparkContext was constructed.
private val creationSite: CallSite = Utils.getCallSite()
@@ -85,6 +85,14 @@ class SparkContext(config: SparkConf) extends Logging {
val startTime = System.currentTimeMillis()
+ @volatile private var stopped: Boolean = false
+
+ private def assertNotStopped(): Unit = {
+ if (stopped) {
+ throw new IllegalStateException("Cannot call methods on a stopped SparkContext")
+ }
+ }
+
/**
* Create a SparkContext that loads settings from system properties (for instance, when
* launching with ./bin/spark-submit).
@@ -172,6 +180,9 @@ class SparkContext(config: SparkConf) extends Logging {
private[spark] def this(master: String, appName: String, sparkHome: String, jars: Seq[String]) =
this(master, appName, sparkHome, jars, Map(), Map())
+ // Log the Spark version in the driver log
+ logInfo(s"Running Spark version $SPARK_VERSION")
+
private[spark] val conf = config.clone()
conf.validateSettings()
@@ -226,7 +237,7 @@ class SparkContext(config: SparkConf) extends Logging {
// An asynchronous listener bus for Spark events
private[spark] val listenerBus = new LiveListenerBus
- conf.set("spark.executor.id", "driver")
+ conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER)
// Create the Spark execution environment (cache, map output tracker, etc)
private[spark] val env = SparkEnv.createDriverEnv(conf, isLocal, listenerBus)
@@ -326,8 +337,13 @@ class SparkContext(config: SparkConf) extends Logging {
try {
dagScheduler = new DAGScheduler(this)
} catch {
- case e: Exception => throw
- new SparkException("DAGScheduler cannot be initialized due to %s".format(e.getMessage))
+ case e: Exception => {
+ try {
+ stop()
+ } finally {
+ throw new SparkException("Error while constructing DAGScheduler", e)
+ }
+ }
}
// start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
@@ -344,6 +360,8 @@ class SparkContext(config: SparkConf) extends Logging {
// The driver's metrics system needs spark.app.id to be set to the application ID.
// So it should start after we get the app ID from the task scheduler and set spark.app.id.
metricsSystem.start()
+ // Attach the driver metrics servlet handler to the web ui after the metrics system is started.
+ metricsSystem.getServletHandlers.foreach(handler => ui.foreach(_.attachHandler(handler)))
// Optionally log Spark events
private[spark] val eventLogger: Option[EventLoggingListener] = {
@@ -357,9 +375,13 @@ class SparkContext(config: SparkConf) extends Logging {
}
// Optionally scale number of executors dynamically based on workload. Exposed for testing.
+ private val dynamicAllocationEnabled = conf.getBoolean("spark.dynamicAllocation.enabled", false)
+ private val dynamicAllocationTesting = conf.getBoolean("spark.dynamicAllocation.testing", false)
private[spark] val executorAllocationManager: Option[ExecutorAllocationManager] =
- if (conf.getBoolean("spark.dynamicAllocation.enabled", false)) {
- Some(new ExecutorAllocationManager(this))
+ if (dynamicAllocationEnabled) {
+ assert(master.contains("yarn") || dynamicAllocationTesting,
+ "Dynamic allocation of executors is currently only supported in YARN mode")
+ Some(new ExecutorAllocationManager(this, listenerBus, conf))
} else {
None
}
@@ -444,7 +466,6 @@ class SparkContext(config: SparkConf) extends Logging {
Option(localProperties.get).map(_.getProperty(key)).getOrElse(null)
/** Set a human readable description of the current job. */
- @deprecated("use setJobGroup", "0.8.1")
def setJobDescription(value: String) {
setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, value)
}
@@ -507,12 +528,12 @@ class SparkContext(config: SparkConf) extends Logging {
/** Distribute a local Scala collection to form an RDD.
*
- * @note Parallelize acts lazily. If `seq` is a mutable collection and is
- * altered after the call to parallelize and before the first action on the
- * RDD, the resultant RDD will reflect the modified collection. Pass a copy of
- * the argument to avoid this.
+ * @note Parallelize acts lazily. If `seq` is a mutable collection and is altered after the call
+ * to parallelize and before the first action on the RDD, the resultant RDD will reflect the
+ * modified collection. Pass a copy of the argument to avoid this.
*/
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
+ assertNotStopped()
new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
}
@@ -528,6 +549,7 @@ class SparkContext(config: SparkConf) extends Logging {
* location preferences (hostnames of Spark nodes) for each object.
* Create a new partition for each collection item. */
def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = {
+ assertNotStopped()
val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap
new ParallelCollectionRDD[T](this, seq.map(_._1), seq.size, indexToPrefs)
}
@@ -537,6 +559,7 @@ class SparkContext(config: SparkConf) extends Logging {
* Hadoop-supported file system URI, and return it as an RDD of Strings.
*/
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
+ assertNotStopped()
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString).setName(path)
}
@@ -570,6 +593,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions):
RDD[(String, String)] = {
+ assertNotStopped()
val job = new NewHadoopJob(hadoopConfiguration)
NewFileInputFormat.addInputPath(job, new Path(path))
val updateConf = job.getConfiguration
@@ -615,6 +639,7 @@ class SparkContext(config: SparkConf) extends Logging {
@Experimental
def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions):
RDD[(String, PortableDataStream)] = {
+ assertNotStopped()
val job = new NewHadoopJob(hadoopConfiguration)
NewFileInputFormat.addInputPath(job, new Path(path))
val updateConf = job.getConfiguration
@@ -639,6 +664,7 @@ class SparkContext(config: SparkConf) extends Logging {
@Experimental
def binaryRecords(path: String, recordLength: Int, conf: Configuration = hadoopConfiguration)
: RDD[Array[Byte]] = {
+ assertNotStopped()
conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength)
val br = newAPIHadoopFile[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](path,
classOf[FixedLengthBinaryInputFormat],
@@ -672,6 +698,7 @@ class SparkContext(config: SparkConf) extends Logging {
valueClass: Class[V],
minPartitions: Int = defaultMinPartitions
): RDD[(K, V)] = {
+ assertNotStopped()
// Add necessary security credentials to the JobConf before broadcasting it.
SparkHadoopUtil.get.addCredentials(conf)
new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minPartitions)
@@ -691,6 +718,7 @@ class SparkContext(config: SparkConf) extends Logging {
valueClass: Class[V],
minPartitions: Int = defaultMinPartitions
): RDD[(K, V)] = {
+ assertNotStopped()
// A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration))
val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
@@ -770,6 +798,7 @@ class SparkContext(config: SparkConf) extends Logging {
kClass: Class[K],
vClass: Class[V],
conf: Configuration = hadoopConfiguration): RDD[(K, V)] = {
+ assertNotStopped()
val job = new NewHadoopJob(conf)
NewFileInputFormat.addInputPath(job, new Path(path))
val updatedConf = job.getConfiguration
@@ -790,6 +819,7 @@ class SparkContext(config: SparkConf) extends Logging {
fClass: Class[F],
kClass: Class[K],
vClass: Class[V]): RDD[(K, V)] = {
+ assertNotStopped()
new NewHadoopRDD(this, fClass, kClass, vClass, conf)
}
@@ -805,6 +835,7 @@ class SparkContext(config: SparkConf) extends Logging {
valueClass: Class[V],
minPartitions: Int
): RDD[(K, V)] = {
+ assertNotStopped()
val inputFormatClass = classOf[SequenceFileInputFormat[K, V]]
hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions)
}
@@ -816,9 +847,10 @@ class SparkContext(config: SparkConf) extends Logging {
* If you plan to directly cache Hadoop writable objects, you should first copy them using
* a `map` function.
* */
- def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]
- ): RDD[(K, V)] =
+ def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = {
+ assertNotStopped()
sequenceFile(path, keyClass, valueClass, defaultMinPartitions)
+ }
/**
* Version of sequenceFile() for types implicitly convertible to Writables through a
@@ -846,6 +878,7 @@ class SparkContext(config: SparkConf) extends Logging {
(implicit km: ClassTag[K], vm: ClassTag[V],
kcf: () => WritableConverter[K], vcf: () => WritableConverter[V])
: RDD[(K, V)] = {
+ assertNotStopped()
val kc = kcf()
val vc = vcf()
val format = classOf[SequenceFileInputFormat[Writable, Writable]]
@@ -867,6 +900,7 @@ class SparkContext(config: SparkConf) extends Logging {
path: String,
minPartitions: Int = defaultMinPartitions
): RDD[T] = {
+ assertNotStopped()
sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions)
.flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes, Utils.getContextOrSparkClassLoader))
}
@@ -942,6 +976,13 @@ class SparkContext(config: SparkConf) extends Logging {
* The variable will be sent to each cluster only once.
*/
def broadcast[T: ClassTag](value: T): Broadcast[T] = {
+ assertNotStopped()
+ if (classOf[RDD[_]].isAssignableFrom(classTag[T].runtimeClass)) {
+ // This is a warning instead of an exception in order to avoid breaking user programs that
+ // might have created RDD broadcast variables but not used them:
+ logWarning("Can not directly broadcast RDDs; instead, call collect() and "
+ + "broadcast the result (see SPARK-5063)")
+ }
val bc = env.broadcastManager.newBroadcast[T](value, isLocal)
val callSite = getCallSite
logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm)
@@ -988,7 +1029,9 @@ class SparkContext(config: SparkConf) extends Logging {
* This is currently only supported in Yarn mode. Return whether the request is received.
*/
@DeveloperApi
- def requestExecutors(numAdditionalExecutors: Int): Boolean = {
+ override def requestExecutors(numAdditionalExecutors: Int): Boolean = {
+ assert(master.contains("yarn") || dynamicAllocationTesting,
+ "Requesting executors is currently only supported in YARN mode")
schedulerBackend match {
case b: CoarseGrainedSchedulerBackend =>
b.requestExecutors(numAdditionalExecutors)
@@ -1004,7 +1047,9 @@ class SparkContext(config: SparkConf) extends Logging {
* This is currently only supported in Yarn mode. Return whether the request is received.
*/
@DeveloperApi
- def killExecutors(executorIds: Seq[String]): Boolean = {
+ override def killExecutors(executorIds: Seq[String]): Boolean = {
+ assert(master.contains("yarn") || dynamicAllocationTesting,
+ "Killing executors is currently only supported in YARN mode")
schedulerBackend match {
case b: CoarseGrainedSchedulerBackend =>
b.killExecutors(executorIds)
@@ -1020,7 +1065,7 @@ class SparkContext(config: SparkConf) extends Logging {
* This is currently only supported in Yarn mode. Return whether the request is received.
*/
@DeveloperApi
- def killExecutor(executorId: String): Boolean = killExecutors(Seq(executorId))
+ override def killExecutor(executorId: String): Boolean = super.killExecutor(executorId)
/** The version of Spark on which this application is running. */
def version = SPARK_VERSION
@@ -1030,6 +1075,7 @@ class SparkContext(config: SparkConf) extends Logging {
* memory available for caching.
*/
def getExecutorMemoryStatus: Map[String, (Long, Long)] = {
+ assertNotStopped()
env.blockManager.master.getMemoryStatus.map { case(blockManagerId, mem) =>
(blockManagerId.host + ":" + blockManagerId.port, mem)
}
@@ -1042,6 +1088,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
@DeveloperApi
def getRDDStorageInfo: Array[RDDInfo] = {
+ assertNotStopped()
val rddInfos = persistentRdds.values.map(RDDInfo.fromRdd).toArray
StorageUtils.updateRddInfo(rddInfos, getExecutorStorageStatus)
rddInfos.filter(_.isCached)
@@ -1059,6 +1106,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
@DeveloperApi
def getExecutorStorageStatus: Array[StorageStatus] = {
+ assertNotStopped()
env.blockManager.master.getStorageStatus
}
@@ -1068,6 +1116,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
@DeveloperApi
def getAllPools: Seq[Schedulable] = {
+ assertNotStopped()
// TODO(xiajunluan): We should take nested pools into account
taskScheduler.rootPool.schedulableQueue.toSeq
}
@@ -1078,6 +1127,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
@DeveloperApi
def getPoolForName(pool: String): Option[Schedulable] = {
+ assertNotStopped()
Option(taskScheduler.rootPool.schedulableNameToSchedulable.get(pool))
}
@@ -1085,6 +1135,7 @@ class SparkContext(config: SparkConf) extends Logging {
* Return current scheduling mode
*/
def getSchedulingMode: SchedulingMode.SchedulingMode = {
+ assertNotStopped()
taskScheduler.schedulingMode
}
@@ -1190,16 +1241,14 @@ class SparkContext(config: SparkConf) extends Logging {
SparkContext.SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
postApplicationEnd()
ui.foreach(_.stop())
- // Do this only if not stopped already - best case effort.
- // prevent NPE if stopped more than once.
- val dagSchedulerCopy = dagScheduler
- dagScheduler = null
- if (dagSchedulerCopy != null) {
+ if (!stopped) {
+ stopped = true
env.metricsSystem.report()
metadataCleaner.cancel()
env.actorSystem.stop(heartbeatReceiver)
cleaner.foreach(_.stop())
- dagSchedulerCopy.stop()
+ dagScheduler.stop()
+ dagScheduler = null
taskScheduler = null
// TODO: Cache.stop()?
env.stop()
@@ -1273,8 +1322,8 @@ class SparkContext(config: SparkConf) extends Logging {
partitions: Seq[Int],
allowLocal: Boolean,
resultHandler: (Int, U) => Unit) {
- if (dagScheduler == null) {
- throw new SparkException("SparkContext has been shutdown")
+ if (stopped) {
+ throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
val cleanedFunc = clean(func)
@@ -1361,6 +1410,7 @@ class SparkContext(config: SparkConf) extends Logging {
func: (TaskContext, Iterator[T]) => U,
evaluator: ApproximateEvaluator[U, R],
timeout: Long): PartialResult[R] = {
+ assertNotStopped()
val callSite = getCallSite
logInfo("Starting job: " + callSite.shortForm)
val start = System.nanoTime
@@ -1383,6 +1433,7 @@ class SparkContext(config: SparkConf) extends Logging {
resultHandler: (Int, U) => Unit,
resultFunc: => R): SimpleFutureAction[R] =
{
+ assertNotStopped()
val cleanF = clean(processPartition)
val callSite = getCallSite
val waiter = dagScheduler.submitJob(
@@ -1401,11 +1452,13 @@ class SparkContext(config: SparkConf) extends Logging {
* for more information.
*/
def cancelJobGroup(groupId: String) {
+ assertNotStopped()
dagScheduler.cancelJobGroup(groupId)
}
/** Cancel all jobs that have been scheduled or are running. */
def cancelAllJobs() {
+ assertNotStopped()
dagScheduler.cancelAllJobs()
}
@@ -1452,13 +1505,20 @@ class SparkContext(config: SparkConf) extends Logging {
def getCheckpointDir = checkpointDir
/** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */
- def defaultParallelism: Int = taskScheduler.defaultParallelism
+ def defaultParallelism: Int = {
+ assertNotStopped()
+ taskScheduler.defaultParallelism
+ }
/** Default min number of partitions for Hadoop RDDs when not given by user */
@deprecated("use defaultMinPartitions", "1.0.0")
def defaultMinSplits: Int = math.min(defaultParallelism, 2)
- /** Default min number of partitions for Hadoop RDDs when not given by user */
+ /**
+ * Default min number of partitions for Hadoop RDDs when not given by user
+ * Notice that we use math.min so the "defaultMinPartitions" cannot be higher than 2.
+ * The reasons for this are discussed in https://github.com/mesos/spark/pull/718
+ */
def defaultMinPartitions: Int = math.min(defaultParallelism, 2)
private val nextShuffleId = new AtomicInteger(0)
@@ -1695,19 +1755,19 @@ object SparkContext extends Logging {
// Implicit conversions to common Writable types, for saveAsSequenceFile
- implicit def intToIntWritable(i: Int) = new IntWritable(i)
+ implicit def intToIntWritable(i: Int): IntWritable = new IntWritable(i)
- implicit def longToLongWritable(l: Long) = new LongWritable(l)
+ implicit def longToLongWritable(l: Long): LongWritable = new LongWritable(l)
- implicit def floatToFloatWritable(f: Float) = new FloatWritable(f)
+ implicit def floatToFloatWritable(f: Float): FloatWritable = new FloatWritable(f)
- implicit def doubleToDoubleWritable(d: Double) = new DoubleWritable(d)
+ implicit def doubleToDoubleWritable(d: Double): DoubleWritable = new DoubleWritable(d)
- implicit def boolToBoolWritable (b: Boolean) = new BooleanWritable(b)
+ implicit def boolToBoolWritable (b: Boolean): BooleanWritable = new BooleanWritable(b)
- implicit def bytesToBytesWritable (aob: Array[Byte]) = new BytesWritable(aob)
+ implicit def bytesToBytesWritable (aob: Array[Byte]): BytesWritable = new BytesWritable(aob)
- implicit def stringToText(s: String) = new Text(s)
+ implicit def stringToText(s: String): Text = new Text(s)
private implicit def arrayToArrayWritable[T <% Writable: ClassTag](arr: Traversable[T])
: ArrayWritable = {
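
The recurring assertNotStopped() calls and the reworked stop() implement one pattern: a volatile stopped flag that every public entry point checks and that stop() flips exactly once, making stop() idempotent and later calls fail fast with IllegalStateException instead of hitting a null DAGScheduler. A minimal, self-contained sketch of that pattern (not SparkContext itself):

object StoppedGuardSketch {
  class Service {
    @volatile private var stopped: Boolean = false

    private def assertNotStopped(): Unit =
      if (stopped) throw new IllegalStateException("Cannot call methods on a stopped service")

    def doWork(): String = {
      assertNotStopped()   // every public entry point checks the flag first
      "ok"
    }

    def stop(): Unit = synchronized {
      if (!stopped) {      // idempotent: a second stop() is a no-op
        stopped = true
        // release resources here
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val s = new Service()
    println(s.doWork())
    s.stop()
    s.stop()               // safe to call twice
    try s.doWork() catch { case e: IllegalStateException => println(e.getMessage) }
  }
}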
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index e464b32e61dd6..4d418037bd33f 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -156,7 +156,15 @@ object SparkEnv extends Logging {
assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!")
val hostname = conf.get("spark.driver.host")
val port = conf.get("spark.driver.port").toInt
- create(conf, SparkContext.DRIVER_IDENTIFIER, hostname, port, true, isLocal, listenerBus)
+ create(
+ conf,
+ SparkContext.DRIVER_IDENTIFIER,
+ hostname,
+ port,
+ isDriver = true,
+ isLocal = isLocal,
+ listenerBus = listenerBus
+ )
}
/**
@@ -169,10 +177,18 @@ object SparkEnv extends Logging {
hostname: String,
port: Int,
numCores: Int,
- isLocal: Boolean,
- actorSystem: ActorSystem = null): SparkEnv = {
- create(conf, executorId, hostname, port, false, isLocal, defaultActorSystem = actorSystem,
- numUsableCores = numCores)
+ isLocal: Boolean): SparkEnv = {
+ val env = create(
+ conf,
+ executorId,
+ hostname,
+ port,
+ isDriver = false,
+ isLocal = isLocal,
+ numUsableCores = numCores
+ )
+ SparkEnv.set(env)
+ env
}
/**
@@ -186,7 +202,6 @@ object SparkEnv extends Logging {
isDriver: Boolean,
isLocal: Boolean,
listenerBus: LiveListenerBus = null,
- defaultActorSystem: ActorSystem = null,
numUsableCores: Int = 0): SparkEnv = {
// Listener bus is only used on the driver
@@ -196,20 +211,17 @@ object SparkEnv extends Logging {
val securityManager = new SecurityManager(conf)
- // If an existing actor system is already provided, use it.
- // This is the case when an executor is launched in coarse-grained mode.
- val (actorSystem, boundPort) =
- Option(defaultActorSystem) match {
- case Some(as) => (as, port)
- case None =>
- val actorSystemName = if (isDriver) driverActorSystemName else executorActorSystemName
- AkkaUtils.createActorSystem(actorSystemName, hostname, port, conf, securityManager)
- }
+ // Create the ActorSystem for Akka and get the port it binds to.
+ val (actorSystem, boundPort) = {
+ val actorSystemName = if (isDriver) driverActorSystemName else executorActorSystemName
+ AkkaUtils.createActorSystem(actorSystemName, hostname, port, conf, securityManager)
+ }
// Figure out which port Akka actually bound to in case the original port is 0 or occupied.
- // This is so that we tell the executors the correct port to connect to.
if (isDriver) {
conf.set("spark.driver.port", boundPort.toString)
+ } else {
+ conf.set("spark.executor.port", boundPort.toString)
}
// Create an instance of the class with the given name, possibly initializing it with our conf
@@ -300,7 +312,7 @@ object SparkEnv extends Logging {
val httpFileServer =
if (isDriver) {
val fileServerPort = conf.getInt("spark.fileserver.port", 0)
- val server = new HttpFileServer(securityManager, fileServerPort)
+ val server = new HttpFileServer(conf, securityManager, fileServerPort)
server.initialize()
conf.set("spark.fileserver.uri", server.serverUri)
server
@@ -383,7 +395,7 @@ object SparkEnv extends Logging {
val sparkProperties = (conf.getAll ++ schedulerMode).sorted
// System properties that are not java classpaths
- val systemProperties = System.getProperties.iterator.toSeq
+ val systemProperties = Utils.getSystemProperties.toSeq
val otherProperties = systemProperties.filter { case (k, _) =>
k != "java.class.path" && !k.startsWith("spark.")
}.sorted
diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
index afd2b85d33a77..9bb0c61e441f8 100644
--- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -22,14 +22,19 @@ import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerExce
import scala.collection.mutable.ArrayBuffer
-private[spark] class TaskContextImpl(val stageId: Int,
+private[spark] class TaskContextImpl(
+ val stageId: Int,
val partitionId: Int,
- val attemptId: Long,
+ override val taskAttemptId: Long,
+ override val attemptNumber: Int,
val runningLocally: Boolean = false,
val taskMetrics: TaskMetrics = TaskMetrics.empty)
extends TaskContext
with Logging {
+ // For backwards-compatibility; this method is now deprecated as of 1.3.0.
+ override def attemptId: Long = taskAttemptId
+
// List of callback functions to execute when the task completes.
@transient private val onCompleteCallbacks = new ArrayBuffer[TaskCompletionListener]
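
TaskContextImpl now distinguishes the cluster-unique taskAttemptId from the per-task attemptNumber, keeping the old attemptId name only as a deprecated alias. A compact sketch of that rename-with-alias shape, with illustrative values:

object AttemptIdSketch {
  // A cluster-unique ID for this task attempt versus the retry counter for this task,
  // with the old name preserved as a deprecated alias for source compatibility.
  class Ctx(val taskAttemptId: Long, val attemptNumber: Int) {
    @deprecated("use taskAttemptId", "1.3.0")
    def attemptId: Long = taskAttemptId
  }

  def main(args: Array[String]): Unit = {
    val ctx = new Ctx(taskAttemptId = 12345L, attemptNumber = 1)
    println(s"attempt ${ctx.attemptNumber} of task attempt ${ctx.taskAttemptId}")
  }
}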
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/NullType.java b/core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala
similarity index 76%
rename from sql/core/src/main/java/org/apache/spark/sql/api/java/NullType.java
rename to core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala
index 6d5ecdf46e551..9df61062e1f85 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/api/java/NullType.java
+++ b/core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala
@@ -15,13 +15,11 @@
* limitations under the License.
*/
-package org.apache.spark.sql.api.java;
+package org.apache.spark
+
+import org.apache.spark.annotation.DeveloperApi
/**
- * The data type representing null and NULL values.
- *
- * {@code NullType} is represented by the singleton object {@link DataType#NullType}.
+ * Exception thrown when a task cannot be serialized.
*/
-public class NullType extends DataType {
- protected NullType() {}
-}
+private[spark] class TaskNotSerializableException(error: Throwable) extends Exception(error)
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index bd451634e53d2..62bf18d82d9b0 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -38,6 +38,10 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.Utils
+/**
+ * Defines operations common to several Java RDD implementations.
+ * Note that this trait is not intended to be implemented by user code.
+ */
trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
def wrapRDD(rdd: RDD[T]): This
@@ -435,6 +439,12 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
*/
def first(): T = rdd.first()
+ /**
+ * @return true if and only if the RDD contains no elements at all. Note that an RDD
+ * may be empty even when it has at least 1 partition.
+ */
+ def isEmpty(): Boolean = rdd.isEmpty()
+
/**
* Save this RDD as a text file, using string representations of elements.
*/
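
isEmpty() is defined so that the partition count does not matter, only whether any element exists. A short usage sketch, assuming a local Spark build of this branch on the classpath:

import org.apache.spark.{SparkConf, SparkContext}

object IsEmptySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("isEmpty-demo"))
    // An RDD with four partitions but no elements is still empty.
    println(sc.parallelize(Seq.empty[Int], 4).isEmpty())   // true
    println(sc.parallelize(Seq(1, 2, 3)).isEmpty())        // false
    sc.stop()
  }
}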
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala
index 86e94931300f8..71b26737b8c02 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala
@@ -80,7 +80,7 @@ private[spark] object JavaUtils {
prev match {
case Some(k) =>
underlying match {
- case mm: mutable.Map[a, _] =>
+ case mm: mutable.Map[A, _] =>
mm remove k
prev = None
case _ =>
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index bad40e6529f74..4ac666c54fbcd 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -125,8 +125,8 @@ private[spark] class PythonRDD(
init, finish))
val memoryBytesSpilled = stream.readLong()
val diskBytesSpilled = stream.readLong()
- context.taskMetrics.memoryBytesSpilled += memoryBytesSpilled
- context.taskMetrics.diskBytesSpilled += diskBytesSpilled
+ context.taskMetrics.incMemoryBytesSpilled(memoryBytesSpilled)
+ context.taskMetrics.incDiskBytesSpilled(diskBytesSpilled)
read()
case SpecialLengths.PYTHON_EXCEPTION_THROWN =>
// Signals that an exception has been thrown in python
diff --git a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
index 31f0a462f84d8..31d6958c403b3 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
@@ -153,7 +153,8 @@ private[broadcast] object HttpBroadcast extends Logging {
private def createServer(conf: SparkConf) {
broadcastDir = Utils.createTempDir(Utils.getLocalDir(conf))
val broadcastPort = conf.getInt("spark.broadcast.port", 0)
- server = new HttpServer(broadcastDir, securityManager, broadcastPort, "HTTP broadcast server")
+ server =
+ new HttpServer(conf, broadcastDir, securityManager, broadcastPort, "HTTP broadcast server")
server.start()
serverUri = server.uri
logInfo("Broadcast server started at " + serverUri)
diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala
index f2687ce6b42b4..7c1c831c248fc 100644
--- a/core/src/main/scala/org/apache/spark/deploy/Client.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala
@@ -160,6 +160,8 @@ object Client {
val (actorSystem, _) = AkkaUtils.createActorSystem(
"driverClient", Utils.localHostName(), 0, conf, new SecurityManager(conf))
+ // Verify that driverArgs.master is a valid URL so that we can use it in ClientActor safely
+ Master.toAkkaUrl(driverArgs.master)
actorSystem.actorOf(Props(classOf[ClientActor], driverArgs, conf))
actorSystem.awaitTermination()
diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
index 2e1e52906ceeb..e5873ce724b9f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable.ListBuffer
import org.apache.log4j.Level
-import org.apache.spark.util.MemoryParam
+import org.apache.spark.util.{IntParam, MemoryParam}
/**
* Command-line parser for the driver client.
@@ -51,8 +51,8 @@ private[spark] class ClientArguments(args: Array[String]) {
parse(args.toList)
def parse(args: List[String]): Unit = args match {
- case ("--cores" | "-c") :: value :: tail =>
- cores = value.toInt
+ case ("--cores" | "-c") :: IntParam(value) :: tail =>
+ cores = value
parse(tail)
case ("--memory" | "-m") :: MemoryParam(value) :: tail =>
diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
index c46f84de8444a..243d8edb72ed3 100644
--- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
@@ -88,6 +88,8 @@ private[deploy] object DeployMessages {
case class KillDriver(driverId: String) extends DeployMessage
+ case class ApplicationFinished(id: String)
+
// Worker internal
case object WorkDirCleanup // Sent to Worker actor periodically for cleaning up app folders
@@ -175,4 +177,5 @@ private[deploy] object DeployMessages {
// Liveness checks in various places
case object SendHeartbeat
+
}
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 60ee115e393ce..57f9faf5ddd1d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.FileSystem.Statistics
import org.apache.hadoop.mapred.JobConf
+import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.hadoop.security.Credentials
import org.apache.hadoop.security.UserGroupInformation
@@ -183,6 +184,17 @@ class SparkHadoopUtil extends Logging {
Class.forName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
statisticsDataClass.getDeclaredMethod(methodName)
}
+
+ /**
+ * Uses reflection to get the Configuration from a JobContext/TaskAttemptContext. Calling
+ * `JobContext/TaskAttemptContext.getConfiguration` directly would generate different bytecode
+ * for Hadoop 1.x and Hadoop 2.x, because JobContext/TaskAttemptContext is a class in
+ * Hadoop 1.x but an interface in Hadoop 2.x.
+ */
+ def getConfigurationFromJobContext(context: JobContext): Configuration = {
+ val method = context.getClass.getMethod("getConfiguration")
+ method.invoke(context).asInstanceOf[Configuration]
+ }
}
object SparkHadoopUtil {
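
getConfigurationFromJobContext resolves getConfiguration reflectively because a compiled direct call pins the bytecode to either the Hadoop 1.x class method or the Hadoop 2.x interface method. The generic shape of that trick is shown below, with a fake context object standing in for a real JobContext:

object ReflectiveCallSketch {
  // Stand-in target; with Hadoop on the classpath this would be a JobContext/TaskAttemptContext.
  class FakeJobContext {
    def getConfiguration: String = "configuration-object"
  }

  // Resolve a zero-argument method by name at runtime, so the call site's bytecode does not
  // care whether the method is declared on a class (Hadoop 1.x) or an interface (Hadoop 2.x).
  def getConfigurationFrom(context: AnyRef): AnyRef = {
    val method = context.getClass.getMethod("getConfiguration")
    method.invoke(context)
  }

  def main(args: Array[String]): Unit = {
    println(getConfigurationFrom(new FakeJobContext))   // prints "configuration-object"
  }
}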
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 955cbd6dab96d..050ba91eb2bc3 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -200,6 +200,7 @@ object SparkSubmit {
// Yarn cluster only
OptionAssigner(args.name, YARN, CLUSTER, clOption = "--name"),
OptionAssigner(args.driverMemory, YARN, CLUSTER, clOption = "--driver-memory"),
+ OptionAssigner(args.driverCores, YARN, CLUSTER, clOption = "--driver-cores"),
OptionAssigner(args.queue, YARN, CLUSTER, clOption = "--queue"),
OptionAssigner(args.numExecutors, YARN, CLUSTER, clOption = "--num-executors"),
OptionAssigner(args.executorMemory, YARN, CLUSTER, clOption = "--executor-memory"),
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index f0e9ee67f6a67..81ec08cb6d501 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -17,6 +17,7 @@
package org.apache.spark.deploy
+import java.net.URI
import java.util.jar.JarFile
import scala.collection.mutable.{ArrayBuffer, HashMap}
@@ -107,6 +108,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
.orElse(sparkProperties.get("spark.driver.memory"))
.orElse(env.get("SPARK_DRIVER_MEMORY"))
.orNull
+ driverCores = Option(driverCores)
+ .orElse(sparkProperties.get("spark.driver.cores"))
+ .orNull
executorMemory = Option(executorMemory)
.orElse(sparkProperties.get("spark.executor.memory"))
.orElse(env.get("SPARK_EXECUTOR_MEMORY"))
@@ -120,23 +124,39 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
name = Option(name).orElse(sparkProperties.get("spark.app.name")).orNull
jars = Option(jars).orElse(sparkProperties.get("spark.jars")).orNull
deployMode = Option(deployMode).orElse(env.get("DEPLOY_MODE")).orNull
+ numExecutors = Option(numExecutors)
+ .getOrElse(sparkProperties.get("spark.executor.instances").orNull)
// Try to set main class from JAR if no --class argument is given
if (mainClass == null && !isPython && primaryResource != null) {
- try {
- val jar = new JarFile(primaryResource)
- // Note that this might still return null if no main-class is set; we catch that later
- mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")
- } catch {
- case e: Exception =>
- SparkSubmit.printErrorAndExit("Cannot load main class from JAR: " + primaryResource)
- return
+ val uri = new URI(primaryResource)
+ val uriScheme = uri.getScheme()
+
+ uriScheme match {
+ case "file" =>
+ try {
+ val jar = new JarFile(uri.getPath)
+ // Note that this might still return null if no main-class is set; we catch that later
+ mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")
+ } catch {
+ case e: Exception =>
+ SparkSubmit.printErrorAndExit(s"Cannot load main class from JAR $primaryResource")
+ }
+ case _ =>
+ SparkSubmit.printErrorAndExit(
+ s"Cannot load main class from JAR $primaryResource with URI $uriScheme. " +
+ "Please specify a class through --class.")
}
}
// Global defaults. These should be kept to a minimum to avoid confusing behavior.
master = Option(master).getOrElse("local[*]")
+ // In YARN mode, app name can be set via SPARK_YARN_APP_NAME (see SPARK-5222)
+ if (master.startsWith("yarn")) {
+ name = Option(name).orElse(env.get("SPARK_YARN_APP_NAME")).orNull
+ }
+
// Set name from main class if not given
name = Option(name).orElse(Option(mainClass)).orNull
if (name == null && primaryResource != null) {
@@ -212,7 +232,10 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
""".stripMargin
}
- /** Fill in values by parsing user options. */
+ /**
+ * Fill in values by parsing user options.
+ * NOTE: Any changes here must be reflected in YarnClientSchedulerBackend.
+ */
private def parseOpts(opts: Seq[String]): Unit = {
val EQ_SEPARATED_OPT="""(--[^=]+)=(.+)""".r
@@ -386,11 +409,14 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
| --total-executor-cores NUM Total cores for all executors.
|
| YARN-only:
+ | --driver-cores NUM Number of cores used by the driver, only in cluster mode
+ | (Default: 1).
| --executor-cores NUM Number of cores per executor (Default: 1).
| --queue QUEUE_NAME The YARN queue to submit to (Default: "default").
| --num-executors NUM Number of executors to launch (Default: 2).
| --archives ARCHIVES Comma separated list of archives to be extracted into the
- | working directory of each executor.""".stripMargin
+ | working directory of each executor.
+ """.stripMargin
)
SparkSubmit.exitFn()
}
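
The main-class inference now inspects the primary resource's URI scheme and only opens local file: JARs; anything else must supply --class. A hedged sketch of that decision follows; the helper name and printed message are illustrative rather than SparkSubmit's own.

import java.net.URI
import java.util.jar.JarFile

object MainClassFromJarSketch {
  // Only a local "file:" JAR can be opened to read the manifest's Main-Class; any other
  // scheme requires an explicit --class argument.
  def mainClassFromJar(primaryResource: String): Option[String] = {
    val uri = new URI(primaryResource)
    uri.getScheme match {
      case "file" =>
        try {
          Option(new JarFile(uri.getPath).getManifest)
            .flatMap(m => Option(m.getMainAttributes.getValue("Main-Class")))
        } catch {
          case _: Exception => None   // unreadable JAR: treat as "no main class found"
        }
      case other =>
        println(s"Cannot load main class from JAR $primaryResource with URI scheme $other")
        None
    }
  }

  def main(args: Array[String]): Unit = {
    println(mainClassFromJar("hdfs://namenode/apps/app.jar"))   // None: remote scheme
    println(mainClassFromJar("file:/tmp/app.jar"))              // None unless the JAR exists locally
  }
}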
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala
index d2687faad62b1..2eab9981845e8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala
@@ -151,7 +151,8 @@ private[spark] object SparkSubmitDriverBootstrapper {
val isWindows = Utils.isWindows
val isSubprocess = sys.env.contains("IS_SUBPROCESS")
if (!isWindows) {
- val stdinThread = new RedirectThread(System.in, process.getOutputStream, "redirect stdin")
+ val stdinThread = new RedirectThread(System.in, process.getOutputStream, "redirect stdin",
+ propagateEof = true)
stdinThread.start()
// Spark submit (JVM) may run as a subprocess, and so this JVM should terminate on
// broken pipe, signaling that the parent process has exited. This is the case if the
diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
index 4efebcaa350fe..39a7b0319b6a1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
@@ -26,7 +26,7 @@ import akka.actor._
import akka.pattern.ask
import akka.remote.{AssociationErrorEvent, DisassociatedEvent, RemotingLifecycleEvent}
-import org.apache.spark.{Logging, SparkConf, SparkException}
+import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
import org.apache.spark.deploy.DeployMessages._
import org.apache.spark.deploy.master.Master
@@ -47,6 +47,8 @@ private[spark] class AppClient(
conf: SparkConf)
extends Logging {
+ val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl)
+
val REGISTRATION_TIMEOUT = 20.seconds
val REGISTRATION_RETRIES = 3
@@ -75,9 +77,9 @@ private[spark] class AppClient(
}
def tryRegisterAllMasters() {
- for (masterUrl <- masterUrls) {
- logInfo("Connecting to master " + masterUrl + "...")
- val actor = context.actorSelection(Master.toAkkaUrl(masterUrl))
+ for (masterAkkaUrl <- masterAkkaUrls) {
+ logInfo("Connecting to master " + masterAkkaUrl + "...")
+ val actor = context.actorSelection(masterAkkaUrl)
actor ! RegisterApplication(appDescription)
}
}
@@ -103,20 +105,14 @@ private[spark] class AppClient(
}
def changeMaster(url: String) {
+ // activeMasterUrl is a valid Spark URL since we receive it from the master.
activeMasterUrl = url
master = context.actorSelection(Master.toAkkaUrl(activeMasterUrl))
- masterAddress = activeMasterUrl match {
- case Master.sparkUrlRegex(host, port) =>
- Address("akka.tcp", Master.systemName, host, port.toInt)
- case x =>
- throw new SparkException("Invalid spark URL: " + x)
- }
+ masterAddress = Master.toAkkaAddress(activeMasterUrl)
}
private def isPossibleMaster(remoteUrl: Address) = {
- masterUrls.map(s => Master.toAkkaUrl(s))
- .map(u => AddressFromURIString(u).hostPort)
- .contains(remoteUrl.hostPort)
+ masterAkkaUrls.map(AddressFromURIString(_).hostPort).contains(remoteUrl.hostPort)
}
override def receiveWithLogging = {
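
AppClient now converts every master URL to an Akka URL once, up front, so a malformed --master fails immediately instead of surfacing later inside the actor. A sketch of such a conversion follows; the spark:// pattern, actor-system name and actor path used here are assumptions for illustration, not necessarily Master.toAkkaUrl's exact output.

object SparkUrlSketch {
  private val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r

  // Validate a spark:// URL eagerly and turn it into an Akka actor path for the Master.
  def toAkkaUrl(sparkUrl: String): String = sparkUrl match {
    case sparkUrlRegex(host, port) => s"akka.tcp://sparkMaster@$host:$port/user/Master"
    case other => throw new IllegalArgumentException(s"Invalid master URL: $other")
  }

  def main(args: Array[String]): Unit = {
    println(toAkkaUrl("spark://master-1:7077"))
    // Converting all URLs up front means a bad --master fails fast, before any actor is created.
    try toAkkaUrl("localhost:7077")
    catch { case e: IllegalArgumentException => println(e.getMessage) }
  }
}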
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
index fbe39b27649f6..553bf3cb945ab 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala
@@ -25,7 +25,8 @@ private[spark] case class ApplicationHistoryInfo(
startTime: Long,
endTime: Long,
lastUpdated: Long,
- sparkUser: String)
+ sparkUser: String,
+ completed: Boolean = false)
private[spark] abstract class ApplicationHistoryProvider {
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 82a54dbfb5330..2b084a2d73b78 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -17,14 +17,16 @@
package org.apache.spark.deploy.history
-import java.io.FileNotFoundException
+import java.io.{BufferedInputStream, FileNotFoundException, InputStream}
import scala.collection.mutable
import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.permission.AccessControlException
import org.apache.spark.{Logging, SecurityManager, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.io.CompressionCodec
import org.apache.spark.scheduler._
import org.apache.spark.ui.SparkUI
import org.apache.spark.util.Utils
@@ -64,6 +66,12 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
@volatile private var applications: mutable.LinkedHashMap[String, FsApplicationHistoryInfo]
= new mutable.LinkedHashMap()
+ // Constants used to parse Spark 1.0.0 log directories.
+ private[history] val LOG_PREFIX = "EVENT_LOG_"
+ private[history] val SPARK_VERSION_PREFIX = "SPARK_VERSION_"
+ private[history] val COMPRESSION_CODEC_PREFIX = "COMPRESSION_CODEC_"
+ private[history] val APPLICATION_COMPLETE = "APPLICATION_COMPLETE"
+
/**
* A background thread that periodically checks for event log updates on disk.
*
@@ -90,7 +98,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
initialize()
- private def initialize() {
+ private def initialize(): Unit = {
// Validate the log directory.
val path = new Path(logDir)
if (!fs.exists(path)) {
@@ -106,8 +114,12 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
}
checkForLogs()
- logCheckingThread.setDaemon(true)
- logCheckingThread.start()
+
+ // Disable the background thread during tests.
+ if (!conf.contains("spark.testing")) {
+ logCheckingThread.setDaemon(true)
+ logCheckingThread.start()
+ }
}
override def getListing() = applications.values
@@ -115,8 +127,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
override def getAppUI(appId: String): Option[SparkUI] = {
try {
applications.get(appId).map { info =>
- val (replayBus, appListener) = createReplayBus(fs.getFileStatus(
- new Path(logDir, info.logDir)))
+ val replayBus = new ReplayListenerBus()
val ui = {
val conf = this.conf.clone()
val appSecManager = new SecurityManager(conf)
@@ -125,15 +136,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
// Do not call ui.bind() to avoid creating a new server for each application
}
- replayBus.replay()
+ val appListener = new ApplicationEventListener()
+ replayBus.addListener(appListener)
+ val appInfo = replay(fs.getFileStatus(new Path(logDir, info.logPath)), replayBus)
- ui.setAppName(s"${appListener.appName.getOrElse(NOT_STARTED)} ($appId)")
+ ui.setAppName(s"${appInfo.name} ($appId)")
val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false)
ui.getSecurityManager.setAcls(uiAclsEnabled)
// make sure to set admin acls before view acls so they are properly picked up
ui.getSecurityManager.setAdminAcls(appListener.adminAcls.getOrElse(""))
- ui.getSecurityManager.setViewAcls(appListener.sparkUser.getOrElse(NOT_STARTED),
+ ui.getSecurityManager.setViewAcls(appInfo.sparkUser,
appListener.viewAcls.getOrElse(""))
ui
}
@@ -149,45 +162,38 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
* Tries to reuse as much of the data already in memory as possible, by not reading
* applications that haven't been updated since last time the logs were checked.
*/
- private def checkForLogs() = {
+ private[history] def checkForLogs(): Unit = {
lastLogCheckTimeMs = getMonotonicTimeMs()
logDebug("Checking for logs. Time is now %d.".format(lastLogCheckTimeMs))
- try {
- val logStatus = fs.listStatus(new Path(logDir))
- val logDirs = if (logStatus != null) logStatus.filter(_.isDir).toSeq else Seq[FileStatus]()
- // Load all new logs from the log directory. Only directories that have a modification time
- // later than the last known log directory will be loaded.
+ try {
var newLastModifiedTime = lastModifiedTime
- val logInfos = logDirs
- .filter { dir =>
- if (fs.isFile(new Path(dir.getPath(), EventLoggingListener.APPLICATION_COMPLETE))) {
- val modTime = getModificationTime(dir)
+ val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq)
+ .getOrElse(Seq[FileStatus]())
+ val logInfos = statusList
+ .filter { entry =>
+ try {
+ val modTime = getModificationTime(entry)
newLastModifiedTime = math.max(newLastModifiedTime, modTime)
- modTime > lastModifiedTime
- } else {
- false
+ modTime >= lastModifiedTime
+ } catch {
+ case e: AccessControlException =>
+ // Do not use "logInfo" since these messages can get pretty noisy if printed on
+ // every poll.
+ logDebug(s"No permission to read $entry, ignoring.")
+ false
}
}
- .flatMap { dir =>
+ .flatMap { entry =>
try {
- val (replayBus, appListener) = createReplayBus(dir)
- replayBus.replay()
- Some(new FsApplicationHistoryInfo(
- dir.getPath().getName(),
- appListener.appId.getOrElse(dir.getPath().getName()),
- appListener.appName.getOrElse(NOT_STARTED),
- appListener.startTime.getOrElse(-1L),
- appListener.endTime.getOrElse(-1L),
- getModificationTime(dir),
- appListener.sparkUser.getOrElse(NOT_STARTED)))
+ Some(replay(entry, new ReplayListenerBus()))
} catch {
case e: Exception =>
- logInfo(s"Failed to load application log data from $dir.", e)
+ logError(s"Failed to load application log data from $entry.", e)
None
}
}
- .sortBy { info => -info.endTime }
+ .sortBy { info => (-info.endTime, -info.startTime) }
lastModifiedTime = newLastModifiedTime
@@ -217,37 +223,112 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
applications = newApps
}
} catch {
- case t: Throwable => logError("Exception in checking for event log updates", t)
+ case e: Exception => logError("Exception in checking for event log updates", e)
}
}
- private def createReplayBus(logDir: FileStatus): (ReplayListenerBus, ApplicationEventListener) = {
- val path = logDir.getPath()
- val elogInfo = EventLoggingListener.parseLoggingInfo(path, fs)
- val replayBus = new ReplayListenerBus(elogInfo.logPaths, fs, elogInfo.compressionCodec)
- val appListener = new ApplicationEventListener
- replayBus.addListener(appListener)
- (replayBus, appListener)
+ /**
+ * Replays the events in the specified log file and returns information about the associated
+ * application.
+ */
+ private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationHistoryInfo = {
+ val logPath = eventLog.getPath()
+ val (logInput, sparkVersion) =
+ if (isLegacyLogDirectory(eventLog)) {
+ openLegacyEventLog(logPath)
+ } else {
+ EventLoggingListener.openEventLog(logPath, fs)
+ }
+ try {
+ val appListener = new ApplicationEventListener
+ bus.addListener(appListener)
+ bus.replay(logInput, sparkVersion)
+ new FsApplicationHistoryInfo(
+ logPath.getName(),
+ appListener.appId.getOrElse(logPath.getName()),
+ appListener.appName.getOrElse(NOT_STARTED),
+ appListener.startTime.getOrElse(-1L),
+ appListener.endTime.getOrElse(-1L),
+ getModificationTime(eventLog),
+ appListener.sparkUser.getOrElse(NOT_STARTED),
+ isApplicationCompleted(eventLog))
+ } finally {
+ logInput.close()
+ }
}
- /** Return when this directory was last modified. */
- private def getModificationTime(dir: FileStatus): Long = {
- try {
- val logFiles = fs.listStatus(dir.getPath)
- if (logFiles != null && !logFiles.isEmpty) {
- logFiles.map(_.getModificationTime).max
- } else {
- dir.getModificationTime
+ /**
+ * Loads a legacy log directory. This assumes that the log directory contains a single event
+ * log file (along with other metadata files), which is the case for directories generated by
+ * the code in previous releases.
+ *
+ * @return 2-tuple of (input stream of the events, version of Spark which wrote the log)
+ */
+ private[history] def openLegacyEventLog(dir: Path): (InputStream, String) = {
+ val children = fs.listStatus(dir)
+ var eventLogPath: Path = null
+ var codecName: Option[String] = None
+ var sparkVersion: String = null
+
+ children.foreach { child =>
+ child.getPath().getName() match {
+ case name if name.startsWith(LOG_PREFIX) =>
+ eventLogPath = child.getPath()
+
+ case codec if codec.startsWith(COMPRESSION_CODEC_PREFIX) =>
+ codecName = Some(codec.substring(COMPRESSION_CODEC_PREFIX.length()))
+
+ case version if version.startsWith(SPARK_VERSION_PREFIX) =>
+ sparkVersion = version.substring(SPARK_VERSION_PREFIX.length())
+
+ case _ =>
}
- } catch {
- case t: Throwable =>
- logError("Exception in accessing modification time of %s".format(dir.getPath), t)
- -1L
+ }
+
+ if (eventLogPath == null || sparkVersion == null) {
+ throw new IllegalArgumentException(s"$dir is not a Spark application log directory.")
+ }
+
+ val codec = try {
+ codecName.map { c => CompressionCodec.createCodec(conf, c) }
+ } catch {
+ case e: Exception =>
+ throw new IllegalArgumentException(s"Unknown compression codec $codecName.")
+ }
+
+ val in = new BufferedInputStream(fs.open(eventLogPath))
+ (codec.map(_.compressedInputStream(in)).getOrElse(in), sparkVersion)
+ }
+
+ /**
+ * Return whether the specified event log path contains an old directory-based event log.
+ * Previously, the event log of an application comprised multiple files in a directory.
+ * As of Spark 1.3, these files are consolidated into a single one that replaces the directory.
+ * See SPARK-2261 for more detail.
+ */
+ private def isLegacyLogDirectory(entry: FileStatus): Boolean = entry.isDir()
+
+ private def getModificationTime(fsEntry: FileStatus): Long = {
+ if (fsEntry.isDir) {
+ fs.listStatus(fsEntry.getPath).map(_.getModificationTime()).max
+ } else {
+ fsEntry.getModificationTime()
}
}
/** Returns the system's monotonically increasing time. */
- private def getMonotonicTimeMs() = System.nanoTime() / (1000 * 1000)
+ private def getMonotonicTimeMs(): Long = System.nanoTime() / (1000 * 1000)
+
+ /**
+ * Return true when the application has completed.
+ */
+ private def isApplicationCompleted(entry: FileStatus): Boolean = {
+ if (isLegacyLogDirectory(entry)) {
+ fs.exists(new Path(entry.getPath(), APPLICATION_COMPLETE))
+ } else {
+ !entry.getPath().getName().endsWith(EventLoggingListener.IN_PROGRESS)
+ }
+ }
}
@@ -256,11 +337,12 @@ private object FsHistoryProvider {
}
private class FsApplicationHistoryInfo(
- val logDir: String,
+ val logPath: String,
id: String,
name: String,
startTime: Long,
endTime: Long,
lastUpdated: Long,
- sparkUser: String)
- extends ApplicationHistoryInfo(id, name, startTime, endTime, lastUpdated, sparkUser)
+ sparkUser: String,
+ completed: Boolean = true)
+ extends ApplicationHistoryInfo(id, name, startTime, endTime, lastUpdated, sparkUser, completed)
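
Editor's note on the new sort key above: `sortBy { info => (-info.endTime, -info.startTime) }` relies on Scala's lexicographic tuple ordering, so finished applications come first (newest end time at the top) and ties, for example in-progress applications whose endTime is -1L, fall back to the most recent start time. A minimal, self-contained sketch with hypothetical data (not part of the patch):

    // Hypothetical data illustrating the (-endTime, -startTime) sort key used above.
    case class AppInfo(id: String, startTime: Long, endTime: Long)

    object SortKeyDemo {
      def main(args: Array[String]): Unit = {
        val apps = Seq(
          AppInfo("app-1", startTime = 100L, endTime = 500L),
          AppInfo("app-2", startTime = 300L, endTime = -1L),  // still in progress
          AppInfo("app-3", startTime = 200L, endTime = -1L))  // still in progress

        // Tuples compare lexicographically: finished apps first (newest end time),
        // then in-progress apps ordered by most recent start time.
        val sorted = apps.sortBy(a => (-a.endTime, -a.startTime))
        sorted.map(_.id).foreach(println)  // app-1, app-2, app-3
      }
    }
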
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
index 5fdc350cd8512..e4e7bc2216014 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
@@ -26,12 +26,15 @@ import org.apache.spark.ui.{WebUIPage, UIUtils}
private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
private val pageSize = 20
+ private val plusOrMinus = 2
def render(request: HttpServletRequest): Seq[Node] = {
val requestedPage = Option(request.getParameter("page")).getOrElse("1").toInt
val requestedFirst = (requestedPage - 1) * pageSize
+ val requestedIncomplete =
+ Option(request.getParameter("showIncomplete")).getOrElse("false").toBoolean
- val allApps = parent.getApplicationList()
+ val allApps = parent.getApplicationList().filter(_.completed != requestedIncomplete)
val actualFirst = if (requestedFirst < allApps.size) requestedFirst else 0
val apps = allApps.slice(actualFirst, Math.min(actualFirst + pageSize, allApps.size))
@@ -39,6 +42,9 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
val last = Math.min(actualFirst + pageSize, allApps.size) - 1
val pageCount = allApps.size / pageSize + (if (allApps.size % pageSize > 0) 1 else 0)
+ val secondPageFromLeft = 2
+ val secondPageFromRight = pageCount - 1
+
val appTable = UIUtils.listingTable(appHeader, appRow, apps)
val providerConfig = parent.getProviderConfig()
val content =
@@ -48,12 +54,38 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
{providerConfig.map { case (k, v) => {k}: {v} }}
{
+ // This displays the indices of pages that are within `plusOrMinus` pages of
+ // the current page. Regardless of where the current page is, this also links
+ // to the first and last page. If the current page minus `plusOrMinus` is greater
+ // than the 2nd page, or the current page plus `plusOrMinus` is less than the
+ // 2nd-to-last page, `...` is displayed in place of the skipped page indices.
if (allApps.size > 0) {
+ val leftSideIndices =
+ rangeIndices(actualPage - plusOrMinus until actualPage, 1 < _)
+ val rightSideIndices =
+ rangeIndices(actualPage + 1 to actualPage + plusOrMinus, _ < pageCount)
+
Showing {actualFirst + 1}-{last + 1} of {allApps.size}
+ {if (requestedIncomplete) "(Incomplete applications)"}
- {if (actualPage > 1) <}
- {if (actualPage < pageCount) >}
+ {
+ if (actualPage > 1) {
+ <
+ 1
+ }
+ }
+ {if (actualPage - plusOrMinus > secondPageFromLeft) " ... "}
+ {leftSideIndices}
+ {actualPage}
+ {rightSideIndices}
+ {if (actualPage + plusOrMinus < secondPageFromRight) " ... "}
+ {
+ if (actualPage < pageCount) {
+ {pageCount}
+ >
+ }
+ }
++
appTable
@@ -67,6 +99,15 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
}
}
+
+ {
+ if (requestedIncomplete) {
+ "Back to completed applications"
+ } else {
+ "Show incomplete applications"
+ }
+ }
+
UIUtils.basicSparkPage(content, "History Server")
@@ -81,11 +122,16 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
"Spark User",
"Last Updated")
+ private def rangeIndices(range: Seq[Int], condition: Int => Boolean): Seq[Node] = {
+ range.filter(condition).map(nextPage => {nextPage} )
+ }
+
private def appRow(info: ApplicationHistoryInfo): Seq[Node] = {
val uiAddress = HistoryServer.UI_PATH_PREFIX + s"/${info.id}"
val startTime = UIUtils.formatDate(info.startTime)
- val endTime = UIUtils.formatDate(info.endTime)
- val duration = UIUtils.formatDuration(info.endTime - info.startTime)
+ val endTime = if (info.endTime > 0) UIUtils.formatDate(info.endTime) else "-"
+ val duration =
+ if (info.endTime > 0) UIUtils.formatDuration(info.endTime - info.startTime) else "-"
val lastUpdated = UIUtils.formatDate(info.lastUpdated)
{info.id} |
@@ -97,4 +143,11 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
{lastUpdated} |
}
+
+ private def makePageLink(linkPage: Int, showIncomplete: Boolean): String = {
+ "/?" + Array(
+ "page=" + linkPage,
+ "showIncomplete=" + showIncomplete
+ ).mkString("&")
+ }
}
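
The pagination comment above can be made concrete with a small standalone sketch of the windowing rule. All names here (window, current, pageCount) are illustrative rather than the page's actual fields; only the conditions mirror the ones in the diff:

    // Hypothetical sketch of the "current page +/- plusOrMinus" window described above.
    object PageWindowDemo {
      def window(current: Int, pageCount: Int, plusOrMinus: Int = 2): Seq[String] = {
        val left  = (current - plusOrMinus until current).filter(_ > 1)
        val right = (current + 1 to current + plusOrMinus).filter(_ < pageCount)
        val leadingGap  = if (current - plusOrMinus > 2) Seq("...") else Nil
        val trailingGap = if (current + plusOrMinus < pageCount - 1) Seq("...") else Nil
        val first = if (current > 1) Seq("1") else Nil
        val last  = if (current < pageCount) Seq(pageCount.toString) else Nil
        first ++ leadingGap ++ left.map(_.toString) ++ Seq(s"[$current]") ++
          right.map(_.toString) ++ trailingGap ++ last
      }

      def main(args: Array[String]): Unit = {
        println(window(current = 7, pageCount = 20).mkString(" "))  // 1 ... 5 6 [7] 8 9 ... 20
      }
    }
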
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index ce00c0ffd21e0..fa9bfe5426b6c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -158,11 +158,12 @@ class HistoryServer(
/**
* The recommended way of starting and stopping a HistoryServer is through the scripts
- * start-history-server.sh and stop-history-server.sh. The path to a base log directory
- * is must be specified, while the requested UI port is optional. For example:
+ * start-history-server.sh and stop-history-server.sh. The path to a base log directory,
+ * as well as any other relevant history server configuration, should be specified via
+ * the $SPARK_HISTORY_OPTS environment variable. For example:
*
- * ./sbin/spark-history-server.sh /tmp/spark-events
- * ./sbin/spark-history-server.sh hdfs://1.2.3.4:9000/spark-events
+ * export SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=/tmp/spark-events"
+ * ./sbin/start-history-server.sh
*
* This launches the HistoryServer as a Spark daemon.
*/
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala
index ad7d81747c377..ede0a9dbefb8d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala
@@ -38,8 +38,8 @@ private[spark] class ApplicationInfo(
extends Serializable {
@transient var state: ApplicationState.Value = _
- @transient var executors: mutable.HashMap[Int, ExecutorInfo] = _
- @transient var removedExecutors: ArrayBuffer[ExecutorInfo] = _
+ @transient var executors: mutable.HashMap[Int, ExecutorDesc] = _
+ @transient var removedExecutors: ArrayBuffer[ExecutorDesc] = _
@transient var coresGranted: Int = _
@transient var endTime: Long = _
@transient var appSource: ApplicationSource = _
@@ -55,12 +55,12 @@ private[spark] class ApplicationInfo(
private def init() {
state = ApplicationState.WAITING
- executors = new mutable.HashMap[Int, ExecutorInfo]
+ executors = new mutable.HashMap[Int, ExecutorDesc]
coresGranted = 0
endTime = -1L
appSource = new ApplicationSource(this)
nextExecutorId = 0
- removedExecutors = new ArrayBuffer[ExecutorInfo]
+ removedExecutors = new ArrayBuffer[ExecutorDesc]
}
private def newExecutorId(useID: Option[Int] = None): Int = {
@@ -75,14 +75,14 @@ private[spark] class ApplicationInfo(
}
}
- def addExecutor(worker: WorkerInfo, cores: Int, useID: Option[Int] = None): ExecutorInfo = {
- val exec = new ExecutorInfo(newExecutorId(useID), this, worker, cores, desc.memoryPerSlave)
+ def addExecutor(worker: WorkerInfo, cores: Int, useID: Option[Int] = None): ExecutorDesc = {
+ val exec = new ExecutorDesc(newExecutorId(useID), this, worker, cores, desc.memoryPerSlave)
executors(exec.id) = exec
coresGranted += cores
exec
}
- def removeExecutor(exec: ExecutorInfo) {
+ def removeExecutor(exec: ExecutorDesc) {
if (executors.contains(exec.id)) {
removedExecutors += executors(exec.id)
executors -= exec.id
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala
similarity index 95%
rename from core/src/main/scala/org/apache/spark/deploy/master/ExecutorInfo.scala
rename to core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala
index d417070c51016..5d620dfcabad5 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorInfo.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala
@@ -19,7 +19,7 @@ package org.apache.spark.deploy.master
import org.apache.spark.deploy.{ExecutorDescription, ExecutorState}
-private[spark] class ExecutorInfo(
+private[spark] class ExecutorDesc(
val id: Int,
val application: ApplicationInfo,
val worker: WorkerInfo,
@@ -37,7 +37,7 @@ private[spark] class ExecutorInfo(
override def equals(other: Any): Boolean = {
other match {
- case info: ExecutorInfo =>
+ case info: ExecutorDesc =>
fullId == info.fullId &&
worker.id == info.worker.id &&
cores == info.cores &&
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 7b32c505def9b..d92d99310a583 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -17,6 +17,7 @@
package org.apache.spark.deploy.master
+import java.io.FileNotFoundException
import java.net.URLEncoder
import java.text.SimpleDateFormat
import java.util.Date
@@ -32,6 +33,7 @@ import akka.pattern.ask
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
import akka.serialization.Serialization
import akka.serialization.SerializationExtension
+import org.apache.hadoop.fs.Path
import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.{ApplicationDescription, DriverDescription,
@@ -56,6 +58,7 @@ private[spark] class Master(
import context.dispatcher // to use Akka's scheduler.schedule()
val conf = new SparkConf
+ val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
val WORKER_TIMEOUT = conf.getLong("spark.worker.timeout", 60) * 1000
@@ -120,6 +123,7 @@ private[spark] class Master(
override def preStart() {
logInfo("Starting Spark master at " + masterUrl)
+ logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
// Listen for remote client disconnection events, since they don't go through Akka's watch()
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
webUi.bind()
@@ -129,6 +133,10 @@ private[spark] class Master(
masterMetricsSystem.registerSource(masterSource)
masterMetricsSystem.start()
applicationMetricsSystem.start()
+ // Attach the master and app metrics servlet handler to the web ui after the metrics systems are
+ // started.
+ masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
+ applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match {
case "ZOOKEEPER" =>
@@ -510,7 +518,7 @@ private[spark] class Master(
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
val numWorkersAlive = shuffledAliveWorkers.size
var curPos = 0
-
+
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
@@ -573,7 +581,7 @@ private[spark] class Master(
}
}
- def launchExecutor(worker: WorkerInfo, exec: ExecutorInfo) {
+ def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc) {
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
worker.addExecutor(exec)
worker.actor ! LaunchExecutor(masterUrl,
@@ -697,6 +705,11 @@ private[spark] class Master(
}
persistenceEngine.removeApplication(app)
schedule()
+
+ // Tell all workers that the application has finished, so they can clean up any app state.
+ workers.foreach { w =>
+ w.actor ! ApplicationFinished(app.id)
+ }
}
}
@@ -707,41 +720,51 @@ private[spark] class Master(
def rebuildSparkUI(app: ApplicationInfo): Boolean = {
val appName = app.desc.name
val notFoundBasePath = HistoryServer.UI_PATH_PREFIX + "/not-found"
- val eventLogDir = app.desc.eventLogDir.getOrElse {
- // Event logging is not enabled for this application
- app.desc.appUiUrl = notFoundBasePath
- return false
- }
-
- val appEventLogDir = EventLoggingListener.getLogDirPath(eventLogDir, app.id)
- val fileSystem = Utils.getHadoopFileSystem(appEventLogDir,
- SparkHadoopUtil.get.newConfiguration(conf))
- val eventLogInfo = EventLoggingListener.parseLoggingInfo(appEventLogDir, fileSystem)
- val eventLogPaths = eventLogInfo.logPaths
- val compressionCodec = eventLogInfo.compressionCodec
-
- if (eventLogPaths.isEmpty) {
- // Event logging is enabled for this application, but no event logs are found
- val title = s"Application history not found (${app.id})"
- var msg = s"No event logs found for application $appName in $appEventLogDir."
- logWarning(msg)
- msg += " Did you specify the correct logging directory?"
- msg = URLEncoder.encode(msg, "UTF-8")
- app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&title=$title"
- return false
- }
-
try {
- val replayBus = new ReplayListenerBus(eventLogPaths, fileSystem, compressionCodec)
+ val eventLogFile = app.desc.eventLogDir
+ .map { dir => EventLoggingListener.getLogPath(dir, app.id) }
+ .getOrElse {
+ // Event logging is not enabled for this application
+ app.desc.appUiUrl = notFoundBasePath
+ return false
+ }
+
+ val fs = Utils.getHadoopFileSystem(eventLogFile, hadoopConf)
+
+ if (fs.exists(new Path(eventLogFile + EventLoggingListener.IN_PROGRESS))) {
+ // Event logging is enabled for this application, but the application is still in progress
+ val title = s"Application history not found (${app.id})"
+ var msg = s"Application $appName is still in progress."
+ logWarning(msg)
+ msg = URLEncoder.encode(msg, "UTF-8")
+ app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&title=$title"
+ return false
+ }
+
+ val (logInput, sparkVersion) = EventLoggingListener.openEventLog(new Path(eventLogFile), fs)
+ val replayBus = new ReplayListenerBus()
val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf),
appName + " (completed)", HistoryServer.UI_PATH_PREFIX + s"/${app.id}")
- replayBus.replay()
+ try {
+ replayBus.replay(logInput, sparkVersion)
+ } finally {
+ logInput.close()
+ }
appIdToUI(app.id) = ui
webUi.attachSparkUI(ui)
// Application UI is successfully rebuilt, so link the Master UI to it
- app.desc.appUiUrl = ui.getBasePath
+ app.desc.appUiUrl = ui.basePath
true
} catch {
+ case fnf: FileNotFoundException =>
+ // Event logging is enabled for this application, but no event logs are found
+ val title = s"Application history not found (${app.id})"
+ var msg = s"No event logs found for application $appName in ${app.desc.eventLogDir}."
+ logWarning(msg)
+ msg += " Did you specify the correct logging directory?"
+ msg = URLEncoder.encode(msg, "UTF-8")
+ app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&title=$title"
+ false
case e: Exception =>
// Relay exception message to application UI page
val title = s"Application history load error (${app.id})"
@@ -823,7 +846,6 @@ private[spark] class Master(
private[spark] object Master extends Logging {
val systemName = "sparkMaster"
private val actorName = "Master"
- val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r
def main(argStrings: Array[String]) {
SignalLogger.register(log)
@@ -833,14 +855,24 @@ private[spark] object Master extends Logging {
actorSystem.awaitTermination()
}
- /** Returns an `akka.tcp://...` URL for the Master actor given a sparkUrl `spark://host:ip`. */
+ /**
+ * Returns an `akka.tcp://...` URL for the Master actor given a sparkUrl `spark://host:port`.
+ *
+ * @throws SparkException if the url is invalid
+ */
def toAkkaUrl(sparkUrl: String): String = {
- sparkUrl match {
- case sparkUrlRegex(host, port) =>
- "akka.tcp://%s@%s:%s/user/%s".format(systemName, host, port, actorName)
- case _ =>
- throw new SparkException("Invalid master URL: " + sparkUrl)
- }
+ val (host, port) = Utils.extractHostPortFromSparkUrl(sparkUrl)
+ "akka.tcp://%s@%s:%s/user/%s".format(systemName, host, port, actorName)
+ }
+
+ /**
+ * Returns an akka `Address` for the Master actor given a sparkUrl `spark://host:port`.
+ *
+ * @throws SparkException if the url is invalid
+ */
+ def toAkkaAddress(sparkUrl: String): Address = {
+ val (host, port) = Utils.extractHostPortFromSparkUrl(sparkUrl)
+ Address("akka.tcp", systemName, host, port)
}
def startSystemAndActor(
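
The regex removed from the Master object above is replaced by a shared helper. Its body is not part of this patch, so the sketch below uses java.net.URI as an illustrative stand-in for Utils.extractHostPortFromSparkUrl; only the akka URL formatting mirrors the code in the diff:

    import java.net.URI

    // Hedged sketch: a hypothetical stand-in for Utils.extractHostPortFromSparkUrl,
    // shown only to illustrate the parse-then-format flow used by toAkkaUrl/toAkkaAddress.
    object SparkUrlDemo {
      def extractHostPort(sparkUrl: String): (String, Int) = {
        val uri = new URI(sparkUrl)
        if (uri.getScheme != "spark" || uri.getHost == null || uri.getPort < 0) {
          throw new IllegalArgumentException(s"Invalid master URL: $sparkUrl")
        }
        (uri.getHost, uri.getPort)
      }

      def toAkkaUrl(sparkUrl: String): String = {
        val (host, port) = extractHostPort(sparkUrl)
        "akka.tcp://%s@%s:%s/user/%s".format("sparkMaster", host, port, "Master")
      }

      def main(args: Array[String]): Unit = {
        println(toAkkaUrl("spark://1.2.3.4:7077"))
        // akka.tcp://sparkMaster@1.2.3.4:7077/user/Master
      }
    }
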
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
index 473ddc23ff0f3..e94aae93e4495 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
@@ -38,7 +38,7 @@ private[spark] class WorkerInfo(
Utils.checkHost(host, "Expected hostname")
assert (port > 0)
- @transient var executors: mutable.HashMap[String, ExecutorInfo] = _ // executorId => info
+ @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info
@transient var drivers: mutable.HashMap[String, DriverInfo] = _ // driverId => info
@transient var state: WorkerState.Value = _
@transient var coresUsed: Int = _
@@ -70,13 +70,13 @@ private[spark] class WorkerInfo(
host + ":" + port
}
- def addExecutor(exec: ExecutorInfo) {
+ def addExecutor(exec: ExecutorDesc) {
executors(exec.fullId) = exec
coresUsed += exec.cores
memoryUsed += exec.memory
}
- def removeExecutor(exec: ExecutorInfo) {
+ def removeExecutor(exec: ExecutorDesc) {
if (executors.contains(exec.fullId)) {
executors -= exec.fullId
coresUsed -= exec.cores
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
index 4588c130ef439..3aae2b95d7396 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
@@ -27,7 +27,7 @@ import org.json4s.JValue
import org.apache.spark.deploy.{ExecutorState, JsonProtocol}
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
-import org.apache.spark.deploy.master.ExecutorInfo
+import org.apache.spark.deploy.master.ExecutorDesc
import org.apache.spark.ui.{UIUtils, WebUIPage}
import org.apache.spark.util.Utils
@@ -109,7 +109,7 @@ private[spark] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app
UIUtils.basicSparkPage(content, "Application: " + app.desc.name)
}
- private def executorRow(executor: ExecutorInfo): Seq[Node] = {
+ private def executorRow(executor: ExecutorDesc): Seq[Node] = {
{executor.id} |
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
index d86ec1e03e45c..73400c5affb5d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
@@ -41,8 +41,6 @@ class MasterWebUI(val master: Master, requestedPort: Int)
attachPage(new HistoryNotFoundPage(this))
attachPage(new MasterPage(this))
attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static"))
- master.masterMetricsSystem.getServletHandlers.foreach(attachHandler)
- master.applicationMetricsSystem.getServletHandlers.foreach(attachHandler)
}
/** Attach a reconstructed UI to this Master UI. Only valid after bind(). */
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
index f4fedc6327ab9..acbdf0d8bd7bc 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
@@ -47,6 +47,7 @@ private[spark] class ExecutorRunner(
val executorDir: File,
val workerUrl: String,
val conf: SparkConf,
+ val appLocalDirs: Seq[String],
var state: ExecutorState.Value)
extends Logging {
@@ -77,7 +78,7 @@ private[spark] class ExecutorRunner(
/**
* Kill executor process, wait for exit and notify worker to update resource status.
*
- * @param message the exception message which caused the executor's death
+ * @param message the exception message which caused the executor's death
*/
private def killProcess(message: Option[String]) {
var exitCode: Option[Int] = None
@@ -129,6 +130,7 @@ private[spark] class ExecutorRunner(
logInfo("Launch command: " + command.mkString("\"", "\" \"", "\""))
builder.directory(executorDir)
+ builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
// In case we are running this from within the Spark Shell, avoid creating a "scala"
// parent process for the executor command
builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")
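
The SPARK_LOCAL_DIRS hand-off added above is ordinary ProcessBuilder environment plumbing. A minimal standalone sketch (the child command and the directory names are illustrative only):

    // Hypothetical illustration of passing per-application local dirs to a child
    // process through its environment, as the ExecutorRunner change above does.
    object LocalDirsDemo {
      def main(args: Array[String]): Unit = {
        val appLocalDirs = Seq("/tmp/spark-app-1234/dir0", "/tmp/spark-app-1234/dir1")
        val builder = new ProcessBuilder("printenv", "SPARK_LOCAL_DIRS")
        builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
        builder.inheritIO()                      // echo the child's output to this console
        val exitCode = builder.start().waitFor() // prints the comma-separated dir list
        println(s"child exited with $exitCode")
      }
    }
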
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index eb11163538b20..13599830123d0 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -23,7 +23,7 @@ import java.text.SimpleDateFormat
import java.util.{UUID, Date}
import scala.collection.JavaConversions._
-import scala.collection.mutable.HashMap
+import scala.collection.mutable.{HashMap, HashSet}
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random
@@ -40,7 +40,7 @@ import org.apache.spark.metrics.MetricsSystem
import org.apache.spark.util.{ActorLogReceive, AkkaUtils, SignalLogger, Utils}
/**
- * @param masterUrls Each url should look like spark://host:port.
+ * @param masterAkkaUrls Each url should be a valid akka url.
*/
private[spark] class Worker(
host: String,
@@ -48,7 +48,7 @@ private[spark] class Worker(
webUiPort: Int,
cores: Int,
memory: Int,
- masterUrls: Array[String],
+ masterAkkaUrls: Array[String],
actorSystemName: String,
actorName: String,
workDirPath: String = null,
@@ -109,6 +109,8 @@ private[spark] class Worker(
val finishedExecutors = new HashMap[String, ExecutorRunner]
val drivers = new HashMap[String, DriverRunner]
val finishedDrivers = new HashMap[String, DriverRunner]
+ val appDirectories = new HashMap[String, Seq[String]]
+ val finishedApps = new HashSet[String]
// The shuffle service is not actually started unless configured.
val shuffleService = new StandaloneWorkerShuffleService(conf, securityMgr)
@@ -153,6 +155,7 @@ private[spark] class Worker(
assert(!registered)
logInfo("Starting Spark worker %s:%d with %d cores, %s RAM".format(
host, port, cores, Utils.megabytesToString(memory)))
+ logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
logInfo("Spark home: " + sparkHome)
createWorkDir()
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
@@ -163,18 +166,16 @@ private[spark] class Worker(
metricsSystem.registerSource(workerSource)
metricsSystem.start()
+ // Attach the worker metrics servlet handler to the web ui after the metrics system is started.
+ metricsSystem.getServletHandlers.foreach(webUi.attachHandler)
}
def changeMaster(url: String, uiUrl: String) {
+ // activeMasterUrl is a valid Spark url since we receive it from the master.
activeMasterUrl = url
activeMasterWebUiUrl = uiUrl
master = context.actorSelection(Master.toAkkaUrl(activeMasterUrl))
- masterAddress = activeMasterUrl match {
- case Master.sparkUrlRegex(_host, _port) =>
- Address("akka.tcp", Master.systemName, _host, _port.toInt)
- case x =>
- throw new SparkException("Invalid spark URL: " + x)
- }
+ masterAddress = Master.toAkkaAddress(activeMasterUrl)
connected = true
// Cancel any outstanding re-registration attempts because we found a new master
registrationRetryTimer.foreach(_.cancel())
@@ -182,9 +183,9 @@ private[spark] class Worker(
}
private def tryRegisterAllMasters() {
- for (masterUrl <- masterUrls) {
- logInfo("Connecting to master " + masterUrl + "...")
- val actor = context.actorSelection(Master.toAkkaUrl(masterUrl))
+ for (masterAkkaUrl <- masterAkkaUrls) {
+ logInfo("Connecting to master " + masterAkkaUrl + "...")
+ val actor = context.actorSelection(masterAkkaUrl)
actor ! RegisterWorker(workerId, host, port, cores, memory, webUi.boundPort, publicAddress)
}
}
@@ -292,7 +293,7 @@ private[spark] class Worker(
val isAppStillRunning = executors.values.map(_.appId).contains(appIdFromDir)
dir.isDirectory && !isAppStillRunning &&
!Utils.doesDirectoryContainAnyNewFiles(dir, APP_DATA_RETENTION_SECS)
- }.foreach { dir =>
+ }.foreach { dir =>
logInfo(s"Removing directory: ${dir.getPath}")
Utils.deleteRecursively(dir)
}
@@ -337,8 +338,19 @@ private[spark] class Worker(
throw new IOException("Failed to create directory " + executorDir)
}
+ // Create local dirs for the executor. These are passed to the executor via the
+ // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the
+ // application finishes.
+ val appLocalDirs = appDirectories.get(appId).getOrElse {
+ Utils.getOrCreateLocalRootDirs(conf).map { dir =>
+ Utils.createDirectory(dir).getAbsolutePath()
+ }.toSeq
+ }
+ appDirectories(appId) = appLocalDirs
+
val manager = new ExecutorRunner(appId, execId, appDesc, cores_, memory_,
- self, workerId, host, sparkHome, executorDir, akkaUrl, conf, ExecutorState.LOADING)
+ self, workerId, host, sparkHome, executorDir, akkaUrl, conf, appLocalDirs,
+ ExecutorState.LOADING)
executors(appId + "/" + execId) = manager
manager.start()
coresUsed += cores_
@@ -375,6 +387,7 @@ private[spark] class Worker(
message.map(" message " + _).getOrElse("") +
exitStatus.map(" exitStatus " + _).getOrElse(""))
}
+ maybeCleanupApplication(appId)
}
case KillExecutor(masterUrl, appId, execId) =>
@@ -444,6 +457,9 @@ private[spark] class Worker(
case ReregisterWithMaster =>
reregisterWithMaster()
+ case ApplicationFinished(id) =>
+ finishedApps += id
+ maybeCleanupApplication(id)
}
private def masterDisconnected() {
@@ -452,6 +468,19 @@ private[spark] class Worker(
registerWithMaster()
}
+ private def maybeCleanupApplication(id: String): Unit = {
+ val shouldCleanup = finishedApps.contains(id) && !executors.values.exists(_.appId == id)
+ if (shouldCleanup) {
+ finishedApps -= id
+ appDirectories.remove(id).foreach { dirList =>
+ logInfo(s"Cleaning up local directories for application $id")
+ dirList.foreach { dir =>
+ Utils.deleteRecursively(new File(dir))
+ }
+ }
+ }
+ }
+
def generateWorkerId(): String = {
"worker-%s-%s-%d".format(createDateFormat.format(new Date), host, port)
}
@@ -494,8 +523,9 @@ private[spark] object Worker extends Logging {
val securityMgr = new SecurityManager(conf)
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port,
conf = conf, securityManager = securityMgr)
+ val masterAkkaUrls = masterUrls.map(Master.toAkkaUrl)
actorSystem.actorOf(Props(classOf[Worker], host, boundPort, webUiPort, cores, memory,
- masterUrls, systemName, actorName, workDir, conf, securityMgr), name = actorName)
+ masterAkkaUrls, systemName, actorName, workDir, conf, securityMgr), name = actorName)
(actorSystem, boundPort)
}
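
The cleanup hook added to the Worker above fires on whichever event arrives last: the ApplicationFinished message or the final executor exit. A small standalone model of that "both conditions met" pattern (all names are illustrative):

    import scala.collection.mutable

    // Hypothetical model of the Worker's cleanup condition: an app's local directories
    // are removed only once it is marked finished AND no executor for it is still
    // registered, regardless of which of the two events happens last.
    object AppCleanupDemo {
      private val finishedApps = mutable.HashSet[String]()
      private val appDirectories = mutable.HashMap[String, Seq[String]]()
      private val runningExecutorApps = mutable.ArrayBuffer[String]()

      def maybeCleanup(appId: String): Unit = {
        val shouldCleanup = finishedApps.contains(appId) && !runningExecutorApps.contains(appId)
        if (shouldCleanup) {
          finishedApps -= appId
          appDirectories.remove(appId).foreach { dirs =>
            println(s"Cleaning up ${dirs.size} local dirs for $appId")
          }
        }
      }

      def main(args: Array[String]): Unit = {
        appDirectories("app-1") = Seq("/tmp/app-1/dir0")
        runningExecutorApps += "app-1"
        finishedApps += "app-1"
        maybeCleanup("app-1")             // no-op: an executor is still running
        runningExecutorApps -= "app-1"
        maybeCleanup("app-1")             // now the directories are removed
      }
    }
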
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
index b07942a9ca729..7ac81a2d87efd 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
@@ -50,7 +50,6 @@ class WorkerWebUI(
attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static"))
attachHandler(createServletHandler("/log",
(request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr))
- worker.metricsSystem.getServletHandlers.foreach(attachHandler)
}
}
diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
index 5f46f3b1f085e..9a4adfbbb3d71 100644
--- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
+++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -21,7 +21,7 @@ import java.nio.ByteBuffer
import scala.concurrent.Await
-import akka.actor.{Actor, ActorSelection, ActorSystem, Props}
+import akka.actor.{Actor, ActorSelection, Props}
import akka.pattern.Patterns
import akka.remote.{RemotingLifecycleEvent, DisassociatedEvent}
@@ -38,8 +38,7 @@ private[spark] class CoarseGrainedExecutorBackend(
executorId: String,
hostPort: String,
cores: Int,
- sparkProperties: Seq[(String, String)],
- actorSystem: ActorSystem)
+ env: SparkEnv)
extends Actor with ActorLogReceive with ExecutorBackend with Logging {
Utils.checkHostPort(hostPort, "Expected hostport")
@@ -58,8 +57,7 @@ private[spark] class CoarseGrainedExecutorBackend(
case RegisteredExecutor =>
logInfo("Successfully registered with driver")
val (hostname, _) = Utils.parseHostPort(hostPort)
- executor = new Executor(executorId, hostname, sparkProperties, cores, isLocal = false,
- actorSystem)
+ executor = new Executor(executorId, hostname, env, isLocal = false)
case RegisterExecutorFailed(message) =>
logError("Slave registration failed: " + message)
@@ -70,10 +68,11 @@ private[spark] class CoarseGrainedExecutorBackend(
logError("Received LaunchTask command but executor was null")
System.exit(1)
} else {
- val ser = SparkEnv.get.closureSerializer.newInstance()
+ val ser = env.closureSerializer.newInstance()
val taskDesc = ser.deserialize[TaskDescription](data.value)
logInfo("Got assigned task " + taskDesc.taskId)
- executor.launchTask(this, taskDesc.taskId, taskDesc.name, taskDesc.serializedTask)
+ executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
+ taskDesc.name, taskDesc.serializedTask)
}
case KillTask(taskId, _, interruptThread) =>
@@ -128,21 +127,25 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
Seq[(String, String)](("spark.app.id", appId))
fetcher.shutdown()
- // Create a new ActorSystem using driver's Spark properties to run the backend.
+ // Create SparkEnv using properties we fetched from the driver.
val driverConf = new SparkConf().setAll(props)
- val (actorSystem, boundPort) = AkkaUtils.createActorSystem(
- SparkEnv.executorActorSystemName,
- hostname, port, driverConf, new SecurityManager(driverConf))
- // set it
+ val env = SparkEnv.createExecutorEnv(
+ driverConf, executorId, hostname, port, cores, isLocal = false)
+
+ // SparkEnv sets spark.executor.port so it shouldn't be 0 anymore.
+ val boundPort = env.conf.getInt("spark.executor.port", 0)
+ assert(boundPort != 0)
+
+ // Start the CoarseGrainedExecutorBackend actor.
val sparkHostPort = hostname + ":" + boundPort
- actorSystem.actorOf(
+ env.actorSystem.actorOf(
Props(classOf[CoarseGrainedExecutorBackend],
- driverUrl, executorId, sparkHostPort, cores, props, actorSystem),
+ driverUrl, executorId, sparkHostPort, cores, env),
name = "Executor")
workerUrl.foreach { url =>
- actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher")
+ env.actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher")
}
- actorSystem.awaitTermination()
+ env.actorSystem.awaitTermination()
}
}
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 835157fc520aa..42566d1a14093 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -26,7 +26,7 @@ import scala.collection.JavaConversions._
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.control.NonFatal
-import akka.actor.{Props, ActorSystem}
+import akka.actor.Props
import org.apache.spark._
import org.apache.spark.deploy.SparkHadoopUtil
@@ -42,10 +42,8 @@ import org.apache.spark.util.{SparkUncaughtExceptionHandler, AkkaUtils, Utils}
private[spark] class Executor(
executorId: String,
slaveHostname: String,
- properties: Seq[(String, String)],
- numCores: Int,
- isLocal: Boolean = false,
- actorSystem: ActorSystem = null)
+ env: SparkEnv,
+ isLocal: Boolean = false)
extends Logging
{
// Application dependencies (added through SparkContext) that we've fetched so far on this node.
@@ -55,6 +53,8 @@ private[spark] class Executor(
private val EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new Array[Byte](0))
+ private val conf = env.conf
+
@volatile private var isStopped = false
// No ip or host:port - just hostname
@@ -65,10 +65,6 @@ private[spark] class Executor(
// Make sure the local hostname we report matches the cluster scheduler's name for this host
Utils.setCustomHostname(slaveHostname)
- // Set spark.* properties from executor arg
- val conf = new SparkConf(true)
- conf.setAll(properties)
-
if (!isLocal) {
// Setup an uncaught exception handler for non-local mode.
// Make any thread terminations due to uncaught exceptions kill the entire
@@ -77,21 +73,11 @@ private[spark] class Executor(
}
val executorSource = new ExecutorSource(this, executorId)
-
- // Initialize Spark environment (using system properties read above)
conf.set("spark.executor.id", executorId)
- private val env = {
- if (!isLocal) {
- val port = conf.getInt("spark.executor.port", 0)
- val _env = SparkEnv.createExecutorEnv(
- conf, executorId, slaveHostname, port, numCores, isLocal, actorSystem)
- SparkEnv.set(_env)
- _env.metricsSystem.registerSource(executorSource)
- _env.blockManager.initialize(conf.getAppId)
- _env
- } else {
- SparkEnv.get
- }
+
+ if (!isLocal) {
+ env.metricsSystem.registerSource(executorSource)
+ env.blockManager.initialize(conf.getAppId)
}
// Create an actor for receiving RPCs from the driver
@@ -122,8 +108,13 @@ private[spark] class Executor(
startDriverHeartbeater()
def launchTask(
- context: ExecutorBackend, taskId: Long, taskName: String, serializedTask: ByteBuffer) {
- val tr = new TaskRunner(context, taskId, taskName, serializedTask)
+ context: ExecutorBackend,
+ taskId: Long,
+ attemptNumber: Int,
+ taskName: String,
+ serializedTask: ByteBuffer) {
+ val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
+ serializedTask)
runningTasks.put(taskId, tr)
threadPool.execute(tr)
}
@@ -145,13 +136,20 @@ private[spark] class Executor(
}
}
+ private def gcTime = ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum
+
class TaskRunner(
- execBackend: ExecutorBackend, val taskId: Long, taskName: String, serializedTask: ByteBuffer)
+ execBackend: ExecutorBackend,
+ val taskId: Long,
+ val attemptNumber: Int,
+ taskName: String,
+ serializedTask: ByteBuffer)
extends Runnable {
@volatile private var killed = false
@volatile var task: Task[Any] = _
@volatile var attemptedTask: Option[Task[Any]] = None
+ @volatile var startGCTime: Long = _
def kill(interruptThread: Boolean) {
logInfo(s"Executor is trying to kill $taskName (TID $taskId)")
@@ -164,15 +162,13 @@ private[spark] class Executor(
override def run() {
val deserializeStartTime = System.currentTimeMillis()
Thread.currentThread.setContextClassLoader(replClassLoader)
- val ser = SparkEnv.get.closureSerializer.newInstance()
+ val ser = env.closureSerializer.newInstance()
logInfo(s"Running $taskName (TID $taskId)")
execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
var taskStart: Long = 0
- def gcTime = ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum
- val startGCTime = gcTime
+ startGCTime = gcTime
try {
- Accumulators.clear()
val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask)
updateDependencies(taskFiles, taskJars)
task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader)
@@ -193,7 +189,7 @@ private[spark] class Executor(
// Run the actual task and measure its runtime.
taskStart = System.currentTimeMillis()
- val value = task.run(taskId.toInt)
+ val value = task.run(taskAttemptId = taskId, attemptNumber = attemptNumber)
val taskFinish = System.currentTimeMillis()
// If the task has been killed, let's fail it.
@@ -201,16 +197,16 @@ private[spark] class Executor(
throw new TaskKilledException
}
- val resultSer = SparkEnv.get.serializer.newInstance()
+ val resultSer = env.serializer.newInstance()
val beforeSerialization = System.currentTimeMillis()
val valueBytes = resultSer.serialize(value)
val afterSerialization = System.currentTimeMillis()
for (m <- task.metrics) {
- m.executorDeserializeTime = taskStart - deserializeStartTime
- m.executorRunTime = taskFinish - taskStart
- m.jvmGCTime = gcTime - startGCTime
- m.resultSerializationTime = afterSerialization - beforeSerialization
+ m.setExecutorDeserializeTime(taskStart - deserializeStartTime)
+ m.setExecutorRunTime(taskFinish - taskStart)
+ m.setJvmGCTime(gcTime - startGCTime)
+ m.setResultSerializationTime(afterSerialization - beforeSerialization)
}
val accumUpdates = Accumulators.values
@@ -261,8 +257,8 @@ private[spark] class Executor(
val serviceTime = System.currentTimeMillis() - taskStart
val metrics = attemptedTask.flatMap(t => t.metrics)
for (m <- metrics) {
- m.executorRunTime = serviceTime
- m.jvmGCTime = gcTime - startGCTime
+ m.setExecutorRunTime(serviceTime)
+ m.setJvmGCTime(gcTime - startGCTime)
}
val reason = new ExceptionFailure(t, metrics)
execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
@@ -278,6 +274,8 @@ private[spark] class Executor(
env.shuffleMemoryManager.releaseMemoryForThisThread()
// Release memory used by this thread for unrolling blocks
env.blockManager.memoryStore.releaseUnrollMemoryForThisThread()
+ // Release memory used by this thread for accumulators
+ Accumulators.clear()
runningTasks.remove(taskId)
}
}
@@ -375,10 +373,15 @@ private[spark] class Executor(
while (!isStopped) {
val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]()
+ val curGCTime = gcTime
+
for (taskRunner <- runningTasks.values()) {
- if (!taskRunner.attemptedTask.isEmpty) {
+ if (taskRunner.attemptedTask.nonEmpty) {
Option(taskRunner.task).flatMap(_.metrics).foreach { metrics =>
- metrics.updateShuffleReadMetrics
+ metrics.updateShuffleReadMetrics()
+ metrics.updateInputMetrics()
+ metrics.setJvmGCTime(curGCTime - taskRunner.startGCTime)
+
if (isLocal) {
// JobProgressListener will hold an reference of it during
// onExecutorMetricsUpdate(), then JobProgressListener can not see
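
Editor's note on the Executor changes above: the `gcTime` helper promoted to the class level is standard JMX plumbing, sampled before and after a task (and from the heartbeat thread) to compute jvmGCTime. A minimal standalone sketch of the same measurement:

    import java.lang.management.ManagementFactory
    import scala.collection.JavaConversions._

    // Illustration of the gcTime pattern used above: total collection time across all
    // garbage collectors, sampled before and after some work to compute jvmGCTime.
    object GcTimeDemo {
      private def gcTime: Long =
        ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum

      def main(args: Array[String]): Unit = {
        val startGCTime = gcTime
        (1 to 1000).foreach(_ => new Array[Byte](1024 * 1024))  // allocation-heavy stand-in for a task
        println(s"jvmGCTime = ${gcTime - startGCTime} ms")
      }
    }
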
diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
index f15e6bc33fb41..cfd672e1d8a97 100644
--- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
+++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
@@ -22,12 +22,13 @@ import java.nio.ByteBuffer
import scala.collection.JavaConversions._
import org.apache.mesos.protobuf.ByteString
-import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver, MesosNativeLibrary}
+import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver}
import org.apache.mesos.Protos.{TaskStatus => MesosTaskStatus, _}
-import org.apache.spark.{Logging, TaskState}
+import org.apache.spark.{Logging, TaskState, SparkConf, SparkEnv}
import org.apache.spark.TaskState.TaskState
import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.scheduler.cluster.mesos.MesosTaskLaunchData
import org.apache.spark.util.{SignalLogger, Utils}
private[spark] class MesosExecutorBackend
@@ -64,19 +65,27 @@ private[spark] class MesosExecutorBackend
this.driver = driver
val properties = Utils.deserialize[Array[(String, String)]](executorInfo.getData.toByteArray) ++
Seq[(String, String)](("spark.app.id", frameworkInfo.getId.getValue))
+ val conf = new SparkConf(loadDefaults = true).setAll(properties)
+ val port = conf.getInt("spark.executor.port", 0)
+ val env = SparkEnv.createExecutorEnv(
+ conf, executorId, slaveInfo.getHostname, port, cpusPerTask, isLocal = false)
+
executor = new Executor(
executorId,
slaveInfo.getHostname,
- properties,
- cpusPerTask)
+ env)
}
override def launchTask(d: ExecutorDriver, taskInfo: TaskInfo) {
val taskId = taskInfo.getTaskId.getValue.toLong
+ val taskData = MesosTaskLaunchData.fromByteString(taskInfo.getData)
if (executor == null) {
logError("Received launchTask but executor was null")
} else {
- executor.launchTask(this, taskId, taskInfo.getName, taskInfo.getData.asReadOnlyByteBuffer)
+ SparkHadoopUtil.get.runAsSparkUser { () =>
+ executor.launchTask(this, taskId = taskId, attemptNumber = taskData.attemptNumber,
+ taskInfo.getName, taskData.serializedTask)
+ }
}
}
@@ -108,11 +117,8 @@ private[spark] class MesosExecutorBackend
private[spark] object MesosExecutorBackend extends Logging {
def main(args: Array[String]) {
SignalLogger.register(log)
- SparkHadoopUtil.get.runAsSparkUser { () =>
- MesosNativeLibrary.load()
- // Create a new Executor and start it running
- val runner = new MesosExecutorBackend()
- new MesosExecutorDriver(runner).run()
- }
+ // Create a new Executor and start it running
+ val runner = new MesosExecutorBackend()
+ new MesosExecutorDriver(runner).run()
}
}
diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index 51b5328cb4c8f..ddb5903bf6875 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -17,6 +17,11 @@
package org.apache.spark.executor
+import java.util.concurrent.atomic.AtomicLong
+
+import org.apache.spark.executor.DataReadMethod
+import org.apache.spark.executor.DataReadMethod.DataReadMethod
+
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.annotation.DeveloperApi
@@ -39,48 +44,78 @@ class TaskMetrics extends Serializable {
/**
* Host's name the task runs on
*/
- var hostname: String = _
-
+ private var _hostname: String = _
+ def hostname = _hostname
+ private[spark] def setHostname(value: String) = _hostname = value
+
/**
* Time taken on the executor to deserialize this task
*/
- var executorDeserializeTime: Long = _
-
+ private var _executorDeserializeTime: Long = _
+ def executorDeserializeTime = _executorDeserializeTime
+ private[spark] def setExecutorDeserializeTime(value: Long) = _executorDeserializeTime = value
+
+
/**
* Time the executor spends actually running the task (including fetching shuffle data)
*/
- var executorRunTime: Long = _
-
+ private var _executorRunTime: Long = _
+ def executorRunTime = _executorRunTime
+ private[spark] def setExecutorRunTime(value: Long) = _executorRunTime = value
+
/**
* The number of bytes this task transmitted back to the driver as the TaskResult
*/
- var resultSize: Long = _
+ private var _resultSize: Long = _
+ def resultSize = _resultSize
+ private[spark] def setResultSize(value: Long) = _resultSize = value
+
/**
* Amount of time the JVM spent in garbage collection while executing this task
*/
- var jvmGCTime: Long = _
+ private var _jvmGCTime: Long = _
+ def jvmGCTime = _jvmGCTime
+ private[spark] def setJvmGCTime(value: Long) = _jvmGCTime = value
/**
* Amount of time spent serializing the task result
*/
- var resultSerializationTime: Long = _
+ private var _resultSerializationTime: Long = _
+ def resultSerializationTime = _resultSerializationTime
+ private[spark] def setResultSerializationTime(value: Long) = _resultSerializationTime = value
/**
* The number of in-memory bytes spilled by this task
*/
- var memoryBytesSpilled: Long = _
+ private var _memoryBytesSpilled: Long = _
+ def memoryBytesSpilled = _memoryBytesSpilled
+ private[spark] def incMemoryBytesSpilled(value: Long) = _memoryBytesSpilled += value
+ private[spark] def decMemoryBytesSpilled(value: Long) = _memoryBytesSpilled -= value
/**
* The number of on-disk bytes spilled by this task
*/
- var diskBytesSpilled: Long = _
+ private var _diskBytesSpilled: Long = _
+ def diskBytesSpilled = _diskBytesSpilled
+ def incDiskBytesSpilled(value: Long) = _diskBytesSpilled += value
+ def decDiskBytesSpilled(value: Long) = _diskBytesSpilled -= value
/**
* If this task reads from a HadoopRDD or from persisted data, metrics on how much data was read
* are stored here.
*/
- var inputMetrics: Option[InputMetrics] = None
+ private var _inputMetrics: Option[InputMetrics] = None
+
+ def inputMetrics = _inputMetrics
+
+ /**
+ * This should only be used when recreating TaskMetrics, not when updating input metrics in
+ * executors
+ */
+ private[spark] def setInputMetrics(inputMetrics: Option[InputMetrics]) {
+ _inputMetrics = inputMetrics
+ }
/**
* If this task writes data externally (e.g. to a distributed filesystem), metrics on how much
@@ -133,19 +168,47 @@ class TaskMetrics extends Serializable {
readMetrics
}
+ /**
+ * Returns the input metrics object that the task should use. Currently, if
+ * there exists an input metric with the same readMethod, we return that one
+ * so the caller can accumulate bytes read. If the readMethod is different
+ * than previously seen by this task, we return a new InputMetric but don't
+ * record it.
+ *
+ * Once https://issues.apache.org/jira/browse/SPARK-5225 is addressed,
+ * we can store all the different inputMetrics (one per readMethod).
+ */
+ private[spark] def getInputMetricsForReadMethod(readMethod: DataReadMethod):
+ InputMetrics = synchronized {
+ _inputMetrics match {
+ case None =>
+ val metrics = new InputMetrics(readMethod)
+ _inputMetrics = Some(metrics)
+ metrics
+ case Some(metrics @ InputMetrics(method)) if method == readMethod =>
+ metrics
+ case Some(InputMetrics(method)) =>
+ new InputMetrics(readMethod)
+ }
+ }
+
/**
* Aggregates shuffle read metrics for all registered dependencies into shuffleReadMetrics.
*/
private[spark] def updateShuffleReadMetrics() = synchronized {
val merged = new ShuffleReadMetrics()
for (depMetrics <- depsShuffleReadMetrics) {
- merged.fetchWaitTime += depMetrics.fetchWaitTime
- merged.localBlocksFetched += depMetrics.localBlocksFetched
- merged.remoteBlocksFetched += depMetrics.remoteBlocksFetched
- merged.remoteBytesRead += depMetrics.remoteBytesRead
+ merged.incFetchWaitTime(depMetrics.fetchWaitTime)
+ merged.incLocalBlocksFetched(depMetrics.localBlocksFetched)
+ merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched)
+ merged.incRemoteBytesRead(depMetrics.remoteBytesRead)
}
_shuffleReadMetrics = Some(merged)
}
+
+ private[spark] def updateInputMetrics() = synchronized {
+ inputMetrics.foreach(_.updateBytesRead())
+ }
}
private[spark] object TaskMetrics {
@@ -179,10 +242,38 @@ object DataWriteMethod extends Enumeration with Serializable {
*/
@DeveloperApi
case class InputMetrics(readMethod: DataReadMethod.Value) {
+
+ private val _bytesRead: AtomicLong = new AtomicLong()
+
/**
* Total bytes read.
*/
- var bytesRead: Long = 0L
+ def bytesRead: Long = _bytesRead.get()
+ @volatile @transient var bytesReadCallback: Option[() => Long] = None
+
+ /**
+ * Adds additional bytes read for this read method.
+ */
+ def addBytesRead(bytes: Long) = {
+ _bytesRead.addAndGet(bytes)
+ }
+
+ /**
+ * Invoke the bytesReadCallback and mutate bytesRead.
+ */
+ def updateBytesRead() {
+ bytesReadCallback.foreach { c =>
+ _bytesRead.set(c())
+ }
+ }
+
+ /**
+ * Register a function that can be called to get up-to-date information on how many bytes the task
+ * has read from an input source.
+ */
+ def setBytesReadCallback(f: Option[() => Long]) {
+ bytesReadCallback = f
+ }
}
/**
@@ -194,7 +285,9 @@ case class OutputMetrics(writeMethod: DataWriteMethod.Value) {
/**
* Total bytes written
*/
- var bytesWritten: Long = 0L
+ private var _bytesWritten: Long = _
+ def bytesWritten = _bytesWritten
+ private[spark] def setBytesWritten(value : Long) = _bytesWritten = value
}
/**
@@ -203,32 +296,45 @@ case class OutputMetrics(writeMethod: DataWriteMethod.Value) {
*/
@DeveloperApi
class ShuffleReadMetrics extends Serializable {
- /**
- * Number of blocks fetched in this shuffle by this task (remote or local)
- */
- def totalBlocksFetched: Int = remoteBlocksFetched + localBlocksFetched
-
/**
* Number of remote blocks fetched in this shuffle by this task
*/
- var remoteBlocksFetched: Int = _
-
+ private var _remoteBlocksFetched: Int = _
+ def remoteBlocksFetched = _remoteBlocksFetched
+ private[spark] def incRemoteBlocksFetched(value: Int) = _remoteBlocksFetched += value
+ private[spark] def decRemoteBlocksFetched(value: Int) = _remoteBlocksFetched -= value
+
/**
* Number of local blocks fetched in this shuffle by this task
*/
- var localBlocksFetched: Int = _
+ private var _localBlocksFetched: Int = _
+ def localBlocksFetched = _localBlocksFetched
+ private[spark] def incLocalBlocksFetched(value: Int) = _localBlocksFetched += value
+ private[spark] def decLocalBlocksFetched(value: Int) = _localBlocksFetched -= value
+
/**
* Time the task spent waiting for remote shuffle blocks. This only includes the time
* blocking on shuffle input data. For instance if block B is being fetched while the task is
* still not finished processing block A, it is not considered to be blocking on block B.
*/
- var fetchWaitTime: Long = _
-
+ private var _fetchWaitTime: Long = _
+ def fetchWaitTime = _fetchWaitTime
+ private[spark] def incFetchWaitTime(value: Long) = _fetchWaitTime += value
+ private[spark] def decFetchWaitTime(value: Long) = _fetchWaitTime -= value
+
/**
* Total number of remote bytes read from the shuffle by this task
*/
- var remoteBytesRead: Long = _
+ private var _remoteBytesRead: Long = _
+ def remoteBytesRead = _remoteBytesRead
+ private[spark] def incRemoteBytesRead(value: Long) = _remoteBytesRead += value
+ private[spark] def decRemoteBytesRead(value: Long) = _remoteBytesRead -= value
+
+ /**
+ * Number of blocks fetched in this shuffle by this task (remote or local)
+ */
+ def totalBlocksFetched = _remoteBlocksFetched + _localBlocksFetched
}
/**
@@ -240,10 +346,18 @@ class ShuffleWriteMetrics extends Serializable {
/**
* Number of bytes written for the shuffle by this task
*/
- @volatile var shuffleBytesWritten: Long = _
-
+ @volatile private var _shuffleBytesWritten: Long = _
+ def shuffleBytesWritten = _shuffleBytesWritten
+ private[spark] def incShuffleBytesWritten(value: Long) = _shuffleBytesWritten += value
+ private[spark] def decShuffleBytesWritten(value: Long) = _shuffleBytesWritten -= value
+
/**
* Time the task spent blocking on writes to disk or buffer cache, in nanoseconds
*/
- @volatile var shuffleWriteTime: Long = _
+ @volatile private var _shuffleWriteTime: Long = _
+ def shuffleWriteTime = _shuffleWriteTime
+ private[spark] def incShuffleWriteTime(value: Long) = _shuffleWriteTime += value
+ private[spark] def decShuffleWriteTime(value: Long) = _shuffleWriteTime -= value
+
+
}
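
The pattern applied throughout TaskMetrics above (private backing field, public getter, package-private inc/dec or set mutators) can be summarized in a tiny standalone sketch. The package and class names are illustrative; TaskMetrics itself uses private[spark]:

    package org.example.demo

    // Minimal illustration of the accessor pattern introduced above: the backing field
    // is private, reads go through a plain getter, and mutation is restricted to the
    // enclosing package via explicit inc/dec methods.
    class DemoMetrics {
      private var _memoryBytesSpilled: Long = 0L
      def memoryBytesSpilled: Long = _memoryBytesSpilled
      private[demo] def incMemoryBytesSpilled(value: Long): Unit = _memoryBytesSpilled += value
      private[demo] def decMemoryBytesSpilled(value: Long): Unit = _memoryBytesSpilled -= value
    }

    object DemoMetricsApp {
      def main(args: Array[String]): Unit = {
        val m = new DemoMetrics
        m.incMemoryBytesSpilled(4096L)  // allowed here: same package
        m.incMemoryBytesSpilled(1024L)
        println(m.memoryBytesSpilled)   // 5120; code outside the package can only read
      }
    }
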
diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
index 89b29af2000c8..c219d21fbefa9 100644
--- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
+++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
@@ -21,6 +21,7 @@ import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
+import org.apache.spark.deploy.SparkHadoopUtil
/**
* Custom Input Format for reading and splitting flat binary files that contain records,
@@ -33,7 +34,7 @@ private[spark] object FixedLengthBinaryInputFormat {
/** Retrieves the record length property from a Hadoop configuration */
def getRecordLength(context: JobContext): Int = {
- context.getConfiguration.get(RECORD_LENGTH_PROPERTY).toInt
+ SparkHadoopUtil.get.getConfigurationFromJobContext(context).get(RECORD_LENGTH_PROPERTY).toInt
}
}
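
The switch to SparkHadoopUtil.get.getConfigurationFromJobContext in this and the following input-format files exists because JobContext.getConfiguration is binary-incompatible between Hadoop 1.x (a class) and Hadoop 2.x (an interface). The helper's body is not shown in this patch; the sketch below is only one plausible reflection-based shim, not the actual implementation:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.mapreduce.JobContext

    // Hedged sketch only: resolving getConfiguration reflectively at runtime avoids
    // linking against either Hadoop version's binary signature of the method.
    object JobContextShim {
      def getConfiguration(context: JobContext): Configuration = {
        val method = context.getClass.getMethod("getConfiguration")
        method.invoke(context).asInstanceOf[Configuration]
      }
    }
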
diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala
index 36a1e5d475f46..67a96925da019 100644
--- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala
+++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala
@@ -24,6 +24,7 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.io.{BytesWritable, LongWritable}
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
+import org.apache.spark.deploy.SparkHadoopUtil
/**
* FixedLengthBinaryRecordReader is returned by FixedLengthBinaryInputFormat.
@@ -82,7 +83,7 @@ private[spark] class FixedLengthBinaryRecordReader
// the actual file we will be reading from
val file = fileSplit.getPath
// job configuration
- val job = context.getConfiguration
+ val job = SparkHadoopUtil.get.getConfigurationFromJobContext(context)
// check compression
val codec = new CompressionCodecFactory(job).getCodec(file)
if (codec != null) {
diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala
index 457472547fcbb..593a62b3e3b32 100644
--- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala
+++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAt
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}
import org.apache.spark.annotation.Experimental
+import org.apache.spark.deploy.SparkHadoopUtil
/**
* A general format for reading whole files in as streams, byte arrays,
@@ -145,7 +146,8 @@ class PortableDataStream(
private val confBytes = {
val baos = new ByteArrayOutputStream()
- context.getConfiguration.write(new DataOutputStream(baos))
+ SparkHadoopUtil.get.getConfigurationFromJobContext(context).
+ write(new DataOutputStream(baos))
baos.toByteArray
}
diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala
index d3601cca832b2..aaef7c74eea33 100644
--- a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala
+++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala
@@ -19,7 +19,6 @@ package org.apache.spark.input
import scala.collection.JavaConversions._
-import org.apache.hadoop.conf.{Configuration, Configurable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
@@ -38,18 +37,13 @@ private[spark] class WholeTextFileInputFormat
override protected def isSplitable(context: JobContext, file: Path): Boolean = false
- private var conf: Configuration = _
- def setConf(c: Configuration) {
- conf = c
- }
- def getConf: Configuration = conf
-
override def createRecordReader(
split: InputSplit,
context: TaskAttemptContext): RecordReader[String, String] = {
- val reader = new WholeCombineFileRecordReader(split, context)
- reader.setConf(conf)
+ val reader =
+ new ConfigurableCombineFileRecordReader(split, context, classOf[WholeTextFileRecordReader])
+ reader.setConf(getConf)
reader
}
diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala b/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
index 6d59b24eb0596..31bde8a78f3c6 100644
--- a/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
+++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
@@ -17,7 +17,7 @@
package org.apache.spark.input
-import org.apache.hadoop.conf.{Configuration, Configurable}
+import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
@@ -26,6 +26,19 @@ import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
+import org.apache.spark.deploy.SparkHadoopUtil
+
+
+/**
+ * A trait to implement [[org.apache.hadoop.conf.Configurable Configurable]] interface.
+ */
+private[spark] trait Configurable extends HConfigurable {
+ private var conf: Configuration = _
+ def setConf(c: Configuration) {
+ conf = c
+ }
+ def getConf: Configuration = conf
+}
/**
* A [[org.apache.hadoop.mapreduce.RecordReader RecordReader]] for reading a single whole text file
@@ -38,14 +51,9 @@ private[spark] class WholeTextFileRecordReader(
index: Integer)
extends RecordReader[String, String] with Configurable {
- private var conf: Configuration = _
- def setConf(c: Configuration) {
- conf = c
- }
- def getConf: Configuration = conf
-
private[this] val path = split.getPath(index)
- private[this] val fs = path.getFileSystem(context.getConfiguration)
+ private[this] val fs = path.getFileSystem(
+ SparkHadoopUtil.get.getConfigurationFromJobContext(context))
// True means the current file has been processed, then skip it.
private[this] var processed = false
@@ -87,29 +95,24 @@ private[spark] class WholeTextFileRecordReader(
/**
- * A [[org.apache.hadoop.mapreduce.RecordReader RecordReader]] for reading a single whole text file
- * out in a key-value pair, where the key is the file path and the value is the entire content of
- * the file.
+ * A [[org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader CombineFileRecordReader]]
+ * that can pass Hadoop Configuration to [[org.apache.hadoop.conf.Configurable Configurable]]
+ * RecordReaders.
*/
-private[spark] class WholeCombineFileRecordReader(
+private[spark] class ConfigurableCombineFileRecordReader[K, V](
split: InputSplit,
- context: TaskAttemptContext)
- extends CombineFileRecordReader[String, String](
+ context: TaskAttemptContext,
+ recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
+ extends CombineFileRecordReader[K, V](
split.asInstanceOf[CombineFileSplit],
context,
- classOf[WholeTextFileRecordReader]
+ recordReaderClass
) with Configurable {
- private var conf: Configuration = _
- def setConf(c: Configuration) {
- conf = c
- }
- def getConf: Configuration = conf
-
override def initNextRecordReader(): Boolean = {
val r = super.initNextRecordReader()
if (r) {
- this.curReader.asInstanceOf[WholeTextFileRecordReader].setConf(conf)
+ this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
}
r
}
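
The hunk above collapses the copy-pasted setConf/getConf fields into one private[spark] Configurable trait and generalizes the combine-file reader to any Hadoop-Configurable RecordReader class. A minimal standalone sketch of the mix-in (MyReader is a hypothetical user of it, not Spark code):

    import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}

    // One trait owns the Configuration so each reader no longer re-implements the boilerplate.
    trait ConfHolder extends HConfigurable {
      private var conf: Configuration = _
      def setConf(c: Configuration): Unit = { conf = c }
      def getConf: Configuration = conf
    }

    class MyReader extends ConfHolder  // inherits setConf/getConf
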
diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
index 1ac7f4e448eb1..f856890d279f4 100644
--- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
+++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -21,11 +21,12 @@ import java.io.{InputStream, OutputStream}
import com.ning.compress.lzf.{LZFInputStream, LZFOutputStream}
import net.jpountz.lz4.{LZ4BlockInputStream, LZ4BlockOutputStream}
-import org.xerial.snappy.{SnappyInputStream, SnappyOutputStream}
+import org.xerial.snappy.{Snappy, SnappyInputStream, SnappyOutputStream}
import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils
+import org.apache.spark.Logging
/**
* :: DeveloperApi ::
@@ -44,25 +45,33 @@ trait CompressionCodec {
def compressedInputStream(s: InputStream): InputStream
}
-
private[spark] object CompressionCodec {
+ private val configKey = "spark.io.compression.codec"
private val shortCompressionCodecNames = Map(
"lz4" -> classOf[LZ4CompressionCodec].getName,
"lzf" -> classOf[LZFCompressionCodec].getName,
"snappy" -> classOf[SnappyCompressionCodec].getName)
def createCodec(conf: SparkConf): CompressionCodec = {
- createCodec(conf, conf.get("spark.io.compression.codec", DEFAULT_COMPRESSION_CODEC))
+ createCodec(conf, conf.get(configKey, DEFAULT_COMPRESSION_CODEC))
}
def createCodec(conf: SparkConf, codecName: String): CompressionCodec = {
val codecClass = shortCompressionCodecNames.getOrElse(codecName.toLowerCase, codecName)
- val ctor = Class.forName(codecClass, true, Utils.getContextOrSparkClassLoader)
- .getConstructor(classOf[SparkConf])
- ctor.newInstance(conf).asInstanceOf[CompressionCodec]
+ val codec = try {
+ val ctor = Class.forName(codecClass, true, Utils.getContextOrSparkClassLoader)
+ .getConstructor(classOf[SparkConf])
+ Some(ctor.newInstance(conf).asInstanceOf[CompressionCodec])
+ } catch {
+ case e: ClassNotFoundException => None
+ case e: IllegalArgumentException => None
+ }
+ codec.getOrElse(throw new IllegalArgumentException(s"Codec [$codecName] is not available. " +
+ s"Consider setting $configKey=$FALLBACK_COMPRESSION_CODEC"))
}
+ val FALLBACK_COMPRESSION_CODEC = "lzf"
val DEFAULT_COMPRESSION_CODEC = "snappy"
val ALL_COMPRESSION_CODECS = shortCompressionCodecNames.values.toSeq
}
@@ -120,6 +129,12 @@ class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec {
@DeveloperApi
class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec {
+ try {
+ Snappy.getNativeLibraryVersion
+ } catch {
+ case e: Error => throw new IllegalArgumentException
+ }
+
override def compressedOutputStream(s: OutputStream): OutputStream = {
val blockSize = conf.getInt("spark.io.compression.snappy.block.size", 32768)
new SnappyOutputStream(s, blockSize)
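
With this change, an unusable codec (for example Snappy when its native library cannot load) is reported as an IllegalArgumentException that names the lzf fallback, rather than an opaque reflection failure. A hedged usage sketch of switching to the fallback:

    import org.apache.spark.SparkConf
    import org.apache.spark.io.CompressionCodec

    // Opt into the pure-JVM LZF codec when Snappy's native library is unavailable.
    val conf = new SparkConf().set("spark.io.compression.codec", "lzf")
    // createCodec is private[spark]; shown only to illustrate the short-name lookup above.
    val codec = CompressionCodec.createCodec(conf)
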
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
index 5dd67b0cbf683..45633e3de01dd 100644
--- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
@@ -76,22 +76,36 @@ private[spark] class MetricsSystem private (
private val sources = new mutable.ArrayBuffer[Source]
private val registry = new MetricRegistry()
+ private var running: Boolean = false
+
// Treat MetricsServlet as a special sink as it should be exposed to add handlers to web ui
private var metricsServlet: Option[MetricsServlet] = None
- /** Get any UI handlers used by this metrics system. */
- def getServletHandlers = metricsServlet.map(_.getHandlers).getOrElse(Array())
+ /**
+ * Get any UI handlers used by this metrics system; can only be called after start().
+ */
+ def getServletHandlers = {
+ require(running, "Can only call getServletHandlers on a running MetricsSystem")
+ metricsServlet.map(_.getHandlers).getOrElse(Array())
+ }
metricsConfig.initialize()
def start() {
+ require(!running, "Attempting to start a MetricsSystem that is already running")
+ running = true
registerSources()
registerSinks()
sinks.foreach(_.start)
}
def stop() {
- sinks.foreach(_.stop)
+ if (running) {
+ sinks.foreach(_.stop)
+ } else {
+ logWarning("Stopping a MetricsSystem that is not running")
+ }
+ running = false
}
def report() {
@@ -107,7 +121,7 @@ private[spark] class MetricsSystem private (
   * @return A unique metric name for each combination of
* application, executor/driver and metric source.
*/
- def buildRegistryName(source: Source): String = {
+ private[spark] def buildRegistryName(source: Source): String = {
val appId = conf.getOption("spark.app.id")
val executorId = conf.getOption("spark.executor.id")
val defaultName = MetricRegistry.name(source.sourceName)
@@ -144,7 +158,7 @@ private[spark] class MetricsSystem private (
})
}
- def registerSources() {
+ private def registerSources() {
val instConfig = metricsConfig.getInstance(instance)
val sourceConfigs = metricsConfig.subProperties(instConfig, MetricsSystem.SOURCE_REGEX)
@@ -160,7 +174,7 @@ private[spark] class MetricsSystem private (
}
}
- def registerSinks() {
+ private def registerSinks() {
val instConfig = metricsConfig.getInstance(instance)
val sinkConfigs = metricsConfig.subProperties(instConfig, MetricsSystem.SINK_REGEX)
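
The new running flag makes the lifecycle explicit: a double start() or a premature getServletHandlers() fails fast, while a stray stop() only logs a warning. A generic sketch of that guard, assuming nothing beyond what the hunk shows:

    // Fail fast on misuse of start()/query, tolerate (but log) a stray stop().
    class GuardedService {
      private var running = false

      def start(): Unit = {
        require(!running, "Attempting to start a service that is already running")
        running = true
      }

      def handlers: Seq[String] = {
        require(running, "Can only query a running service")
        Seq.empty
      }

      def stop(): Unit = {
        if (!running) println("WARN: stopping a service that is not running")
        running = false
      }
    }
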
diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
index 0027cbb0ff1fb..3f0950dae1f24 100644
--- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
+++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
@@ -60,7 +60,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage
}
transportContext = new TransportContext(transportConf, rpcHandler)
clientFactory = transportContext.createClientFactory(bootstrap.toList)
- server = transportContext.createServer()
+ server = transportContext.createServer(conf.getInt("spark.blockManager.port", 0))
appId = conf.getAppId
logInfo("Server created on " + server.getPort)
}
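
This wires the existing spark.blockManager.port setting through to the Netty transfer server (0 keeps the pick-an-ephemeral-port behaviour). Setting a fixed port from user code, as a sketch:

    import org.apache.spark.SparkConf

    // Pin the block transfer service to a known port; the default of 0 means "ephemeral".
    val conf = new SparkConf()
      .setAppName("example")            // hypothetical app name
      .set("spark.blockManager.port", "50002")
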
diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
index df4b085d2251e..03c4137ca0a81 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
@@ -81,11 +81,24 @@ private[nio] class ConnectionManager(
private val ackTimeoutMonitor =
new HashedWheelTimer(Utils.namedThreadFactory("AckTimeoutMonitor"))
- private val ackTimeout = conf.getInt("spark.core.connection.ack.wait.timeout", 60)
+ private val ackTimeout =
+ conf.getInt("spark.core.connection.ack.wait.timeout", conf.getInt("spark.network.timeout", 120))
+
+ // Get the thread counts from the Spark Configuration.
+ //
+ // Even though the ThreadPoolExecutor constructor takes both a minimum and maximum value,
+ // we only query for the minimum value because we are using LinkedBlockingDeque.
+ //
+ // The JavaDoc for ThreadPoolExecutor points out that when using a LinkedBlockingDeque (which is
+ // an unbounded queue) no more than corePoolSize threads will ever be created, so only the "min"
+ // parameter is necessary.
+ private val handlerThreadCount = conf.getInt("spark.core.connection.handler.threads.min", 20)
+ private val ioThreadCount = conf.getInt("spark.core.connection.io.threads.min", 4)
+ private val connectThreadCount = conf.getInt("spark.core.connection.connect.threads.min", 1)
private val handleMessageExecutor = new ThreadPoolExecutor(
- conf.getInt("spark.core.connection.handler.threads.min", 20),
- conf.getInt("spark.core.connection.handler.threads.max", 60),
+ handlerThreadCount,
+ handlerThreadCount,
conf.getInt("spark.core.connection.handler.threads.keepalive", 60), TimeUnit.SECONDS,
new LinkedBlockingDeque[Runnable](),
Utils.namedThreadFactory("handle-message-executor")) {
@@ -96,12 +109,11 @@ private[nio] class ConnectionManager(
logError("Error in handleMessageExecutor is not handled properly", t)
}
}
-
}
private val handleReadWriteExecutor = new ThreadPoolExecutor(
- conf.getInt("spark.core.connection.io.threads.min", 4),
- conf.getInt("spark.core.connection.io.threads.max", 32),
+ ioThreadCount,
+ ioThreadCount,
conf.getInt("spark.core.connection.io.threads.keepalive", 60), TimeUnit.SECONDS,
new LinkedBlockingDeque[Runnable](),
Utils.namedThreadFactory("handle-read-write-executor")) {
@@ -112,14 +124,13 @@ private[nio] class ConnectionManager(
logError("Error in handleReadWriteExecutor is not handled properly", t)
}
}
-
}
// Use a different, yet smaller, thread pool - infrequently used with very short lived tasks :
// which should be executed asap
private val handleConnectExecutor = new ThreadPoolExecutor(
- conf.getInt("spark.core.connection.connect.threads.min", 1),
- conf.getInt("spark.core.connection.connect.threads.max", 8),
+ connectThreadCount,
+ connectThreadCount,
conf.getInt("spark.core.connection.connect.threads.keepalive", 60), TimeUnit.SECONDS,
new LinkedBlockingDeque[Runnable](),
Utils.namedThreadFactory("handle-connect-executor")) {
@@ -130,7 +141,6 @@ private[nio] class ConnectionManager(
logError("Error in handleConnectExecutor is not handled properly", t)
}
}
-
}
private val serverChannel = ServerSocketChannel.open()
@@ -164,7 +174,7 @@ private[nio] class ConnectionManager(
serverChannel.socket.bind(new InetSocketAddress(port))
(serverChannel, serverChannel.socket.getLocalPort)
}
- Utils.startServiceOnPort[ServerSocketChannel](port, startService, name)
+ Utils.startServiceOnPort[ServerSocketChannel](port, startService, conf, name)
serverChannel.register(selector, SelectionKey.OP_ACCEPT)
val id = new ConnectionManagerId(Utils.localHostName, serverChannel.socket.getLocalPort)
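
As the new comment in this file explains, a ThreadPoolExecutor fed by an unbounded LinkedBlockingDeque never grows past corePoolSize, so the old *.threads.max settings were effectively dead configuration; passing the same value for core and max makes that explicit. A standalone sketch of the pattern:

    import java.util.concurrent.{LinkedBlockingDeque, ThreadPoolExecutor, TimeUnit}

    val threads = 4
    // With an unbounded queue only corePoolSize threads are ever created,
    // so core and max are deliberately identical.
    val pool = new ThreadPoolExecutor(
      threads, threads,
      60, TimeUnit.SECONDS,
      new LinkedBlockingDeque[Runnable]())
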
diff --git a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
index 7ba1182f0ed27..1c13e2c372845 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala
@@ -95,7 +95,8 @@ private[spark] object CheckpointRDD extends Logging {
val finalOutputName = splitIdToFile(ctx.partitionId)
val finalOutputPath = new Path(outputDir, finalOutputName)
- val tempOutputPath = new Path(outputDir, "." + finalOutputName + "-attempt-" + ctx.attemptId)
+ val tempOutputPath =
+ new Path(outputDir, "." + finalOutputName + "-attempt-" + ctx.attemptNumber)
if (fs.exists(tempOutputPath)) {
throw new IOException("Checkpoint failed: temporary path " +
@@ -119,7 +120,7 @@ private[spark] object CheckpointRDD extends Logging {
logInfo("Deleting tempOutputPath " + tempOutputPath)
fs.delete(tempOutputPath, false)
throw new IOException("Checkpoint failed: failed to save output of task: "
- + ctx.attemptId + " and final output path does not exist")
+ + ctx.attemptNumber + " and final output path does not exist")
} else {
// Some other copy of this task must've finished before us and renamed it
logInfo("Final output path " + finalOutputPath + " already exists; not overwriting it")
diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
index ffc0a8a6d67eb..07398a6fa62f6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
@@ -60,7 +60,7 @@ private[spark] class CoGroupPartition(idx: Int, val deps: Array[CoGroupSplitDep]
* A RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a
* tuple with the list of values for that key.
*
- * Note: This is an internal API. We recommend users use RDD.coGroup(...) instead of
+ * Note: This is an internal API. We recommend users use RDD.cogroup(...) instead of
* instantiating this directly.
* @param rdds parent RDDs.
@@ -70,8 +70,8 @@ private[spark] class CoGroupPartition(idx: Int, val deps: Array[CoGroupSplitDep]
class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part: Partitioner)
extends RDD[(K, Array[Iterable[_]])](rdds.head.context, Nil) {
- // For example, `(k, a) cogroup (k, b)` produces k -> Seq(ArrayBuffer as, ArrayBuffer bs).
- // Each ArrayBuffer is represented as a CoGroup, and the resulting Seq as a CoGroupCombiner.
+ // For example, `(k, a) cogroup (k, b)` produces k -> Array(ArrayBuffer as, ArrayBuffer bs).
+ // Each ArrayBuffer is represented as a CoGroup, and the resulting Array as a CoGroupCombiner.
// CoGroupValue is the intermediate state of each value before being merged in compute.
private type CoGroup = CompactBuffer[Any]
private type CoGroupValue = (Any, Int) // Int is dependency number
@@ -159,8 +159,8 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part:
for ((it, depNum) <- rddIterators) {
map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum))))
}
- context.taskMetrics.memoryBytesSpilled += map.memoryBytesSpilled
- context.taskMetrics.diskBytesSpilled += map.diskBytesSpilled
+ context.taskMetrics.incMemoryBytesSpilled(map.memoryBytesSpilled)
+ context.taskMetrics.incDiskBytesSpilled(map.diskBytesSpilled)
new InterruptibleIterator(context,
map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]])
}
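
The corrected comment describes cogroup's per-key output: one Iterable per parent RDD. A small usage sketch (assumes a live SparkContext named sc):

    val left  = sc.parallelize(Seq(("k", 1), ("k", 2), ("j", 3)))
    val right = sc.parallelize(Seq(("k", "x")))

    // Each key maps to a tuple of Iterables, one per parent RDD, e.g.
    //   ("k", (Iterable(1, 2), Iterable("x"))) and ("j", (Iterable(3), Iterable()))
    val grouped = left.cogroup(right).collect()
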
diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
index 9fab1d78abb04..b073eba8a1574 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
@@ -35,11 +35,10 @@ import org.apache.spark.util.Utils
* @param preferredLocation the preferred location for this partition
*/
private[spark] case class CoalescedRDDPartition(
- index: Int,
- @transient rdd: RDD[_],
- parentsIndices: Array[Int],
- @transient preferredLocation: String = ""
- ) extends Partition {
+ index: Int,
+ @transient rdd: RDD[_],
+ parentsIndices: Array[Int],
+ @transient preferredLocation: Option[String] = None) extends Partition {
var parents: Seq[Partition] = parentsIndices.map(rdd.partitions(_))
@throws(classOf[IOException])
@@ -55,9 +54,10 @@ private[spark] case class CoalescedRDDPartition(
* @return locality of this coalesced partition between 0 and 1
*/
def localFraction: Double = {
- val loc = parents.count(p =>
- rdd.context.getPreferredLocs(rdd, p.index).map(tl => tl.host).contains(preferredLocation))
-
+ val loc = parents.count { p =>
+ val parentPreferredLocations = rdd.context.getPreferredLocs(rdd, p.index).map(_.host)
+ preferredLocation.exists(parentPreferredLocations.contains)
+ }
if (parents.size == 0) 0.0 else (loc.toDouble / parents.size.toDouble)
}
}
@@ -73,9 +73,9 @@ private[spark] case class CoalescedRDDPartition(
* @param balanceSlack used to trade-off balance and locality. 1.0 is all locality, 0 is all balance
*/
private[spark] class CoalescedRDD[T: ClassTag](
- @transient var prev: RDD[T],
- maxPartitions: Int,
- balanceSlack: Double = 0.10)
+ @transient var prev: RDD[T],
+ maxPartitions: Int,
+ balanceSlack: Double = 0.10)
extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies
override def getPartitions: Array[Partition] = {
@@ -113,7 +113,7 @@ private[spark] class CoalescedRDD[T: ClassTag](
* @return the machine most preferred by split
*/
override def getPreferredLocations(partition: Partition): Seq[String] = {
- List(partition.asInstanceOf[CoalescedRDDPartition].preferredLocation)
+ partition.asInstanceOf[CoalescedRDDPartition].preferredLocation.toSeq
}
}
@@ -147,7 +147,7 @@ private[spark] class CoalescedRDD[T: ClassTag](
*
*/
-private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanceSlack: Double) {
+private class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanceSlack: Double) {
def compare(o1: PartitionGroup, o2: PartitionGroup): Boolean = o1.size < o2.size
def compare(o1: Option[PartitionGroup], o2: Option[PartitionGroup]): Boolean =
@@ -341,8 +341,14 @@ private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanc
}
}
-private[spark] case class PartitionGroup(prefLoc: String = "") {
+private case class PartitionGroup(prefLoc: Option[String] = None) {
var arr = mutable.ArrayBuffer[Partition]()
-
def size = arr.size
}
+
+private object PartitionGroup {
+ def apply(prefLoc: String): PartitionGroup = {
+ require(prefLoc != "", "Preferred location must not be empty")
+ PartitionGroup(Some(prefLoc))
+ }
+}
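
Replacing the empty-string sentinel with Option[String] lets the call sites fall out of standard Option combinators (exists in localFraction, toSeq in getPreferredLocations). Minimal sketch of the idiom:

    // Option instead of "" as the "no preferred host" sentinel.
    val preferredLocation: Option[String] = Some("host-1")   // or None
    val parentHosts = Seq("host-1", "host-2")

    val matchesParent = preferredLocation.exists(parentHosts.contains)  // false when None
    val asSeq: Seq[String] = preferredLocation.toSeq                    // Nil when None
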
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index a157e36e2286e..056aef0bc210a 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -131,7 +131,7 @@ class HadoopRDD[K, V](
// used to build JobTracker ID
private val createTime = new Date()
- private val shouldCloneJobConf = sc.conf.get("spark.hadoop.cloneConf", "false").toBoolean
+ private val shouldCloneJobConf = sc.conf.getBoolean("spark.hadoop.cloneConf", false)
// Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads.
protected def getJobConf(): JobConf = {
@@ -213,23 +213,24 @@ class HadoopRDD[K, V](
logInfo("Input split: " + split.inputSplit)
val jobConf = getJobConf()
- val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
+ val inputMetrics = context.taskMetrics
+ .getInputMetricsForReadMethod(DataReadMethod.Hadoop)
+
// Find a function that will return the FileSystem bytes read by this thread. Do this before
// creating RecordReader, because RecordReader's constructor might read some bytes
- val bytesReadCallback = if (split.inputSplit.value.isInstanceOf[FileSplit]) {
- SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(
- split.inputSplit.value.asInstanceOf[FileSplit].getPath, jobConf)
- } else {
- None
- }
- if (bytesReadCallback.isDefined) {
- context.taskMetrics.inputMetrics = Some(inputMetrics)
- }
+ val bytesReadCallback = inputMetrics.bytesReadCallback.orElse(
+ split.inputSplit.value match {
+ case split: FileSplit =>
+ SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, jobConf)
+ case _ => None
+ }
+ )
+ inputMetrics.setBytesReadCallback(bytesReadCallback)
var reader: RecordReader[K, V] = null
val inputFormat = getInputFormat(jobConf)
HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(createTime),
- context.stageId, theSplit.index, context.attemptId.toInt, jobConf)
+ context.stageId, theSplit.index, context.attemptNumber, jobConf)
reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
// Register an on-task-completion callback to close the input stream.
@@ -237,8 +238,6 @@ class HadoopRDD[K, V](
val key: K = reader.createKey()
val value: V = reader.createValue()
- var recordsSinceMetricsUpdate = 0
-
override def getNext() = {
try {
finished = !reader.next(key, value)
@@ -247,15 +246,6 @@ class HadoopRDD[K, V](
finished = true
}
- // Update bytes read metric every few records
- if (recordsSinceMetricsUpdate == HadoopRDD.RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES
- && bytesReadCallback.isDefined) {
- recordsSinceMetricsUpdate = 0
- val bytesReadFn = bytesReadCallback.get
- inputMetrics.bytesRead = bytesReadFn()
- } else {
- recordsSinceMetricsUpdate += 1
- }
(key, value)
}
@@ -263,14 +253,12 @@ class HadoopRDD[K, V](
try {
reader.close()
if (bytesReadCallback.isDefined) {
- val bytesReadFn = bytesReadCallback.get
- inputMetrics.bytesRead = bytesReadFn()
+ inputMetrics.updateBytesRead()
} else if (split.inputSplit.value.isInstanceOf[FileSplit]) {
// If we can't get the bytes read from the FS stats, fall back to the split size,
// which may be inaccurate.
try {
- inputMetrics.bytesRead = split.inputSplit.value.getLength
- context.taskMetrics.inputMetrics = Some(inputMetrics)
+ inputMetrics.addBytesRead(split.inputSplit.value.getLength)
} catch {
case e: java.io.IOException =>
logWarning("Unable to get input size to set InputMetrics for task", e)
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index e55d03d391e03..7b0e3c87ccff4 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -109,18 +109,19 @@ class NewHadoopRDD[K, V](
logInfo("Input split: " + split.serializableHadoopSplit)
val conf = confBroadcast.value.value
- val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
+ val inputMetrics = context.taskMetrics
+ .getInputMetricsForReadMethod(DataReadMethod.Hadoop)
+
// Find a function that will return the FileSystem bytes read by this thread. Do this before
// creating RecordReader, because RecordReader's constructor might read some bytes
- val bytesReadCallback = if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit]) {
- SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(
- split.serializableHadoopSplit.value.asInstanceOf[FileSplit].getPath, conf)
- } else {
- None
- }
- if (bytesReadCallback.isDefined) {
- context.taskMetrics.inputMetrics = Some(inputMetrics)
- }
+ val bytesReadCallback = inputMetrics.bytesReadCallback.orElse(
+ split.serializableHadoopSplit.value match {
+ case split: FileSplit =>
+ SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, conf)
+ case _ => None
+ }
+ )
+ inputMetrics.setBytesReadCallback(bytesReadCallback)
val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0)
val hadoopAttemptContext = newTaskAttemptContext(conf, attemptId)
@@ -154,33 +155,19 @@ class NewHadoopRDD[K, V](
}
havePair = false
- // Update bytes read metric every few records
- if (recordsSinceMetricsUpdate == HadoopRDD.RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES
- && bytesReadCallback.isDefined) {
- recordsSinceMetricsUpdate = 0
- val bytesReadFn = bytesReadCallback.get
- inputMetrics.bytesRead = bytesReadFn()
- } else {
- recordsSinceMetricsUpdate += 1
- }
-
(reader.getCurrentKey, reader.getCurrentValue)
}
private def close() {
try {
reader.close()
-
- // Update metrics with final amount
if (bytesReadCallback.isDefined) {
- val bytesReadFn = bytesReadCallback.get
- inputMetrics.bytesRead = bytesReadFn()
+ inputMetrics.updateBytesRead()
} else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit]) {
// If we can't get the bytes read from the FS stats, fall back to the split size,
// which may be inaccurate.
try {
- inputMetrics.bytesRead = split.serializableHadoopSplit.value.getLength
- context.taskMetrics.inputMetrics = Some(inputMetrics)
+ inputMetrics.addBytesRead(split.serializableHadoopSplit.value.getLength)
} catch {
case e: java.io.IOException =>
logWarning("Unable to get input size to set InputMetrics for task", e)
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index c43e1f2fe135e..0f37d830ef34f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -25,6 +25,7 @@ import scala.collection.{Map, mutable}
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag
+import scala.util.DynamicVariable
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus
import org.apache.hadoop.conf.{Configurable, Configuration}
@@ -84,7 +85,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
throw new SparkException("Default partitioner cannot partition array keys.")
}
}
- val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
+ val aggregator = new Aggregator[K, V, C](
+ self.context.clean(createCombiner),
+ self.context.clean(mergeValue),
+ self.context.clean(mergeCombiners))
if (self.partitioner == Some(partitioner)) {
self.mapPartitions(iter => {
val context = TaskContext.get()
@@ -120,11 +124,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
def aggregateByKey[U: ClassTag](zeroValue: U, partitioner: Partitioner)(seqOp: (U, V) => U,
combOp: (U, U) => U): RDD[(K, U)] = {
// Serialize the zero value to a byte array so that we can get a new clone of it on each key
- val zeroBuffer = SparkEnv.get.closureSerializer.newInstance().serialize(zeroValue)
+ val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue)
val zeroArray = new Array[Byte](zeroBuffer.limit)
zeroBuffer.get(zeroArray)
- lazy val cachedSerializer = SparkEnv.get.closureSerializer.newInstance()
+ lazy val cachedSerializer = SparkEnv.get.serializer.newInstance()
val createZero = () => cachedSerializer.deserialize[U](ByteBuffer.wrap(zeroArray))
combineByKey[U]((v: V) => seqOp(createZero(), v), seqOp, combOp, partitioner)
@@ -165,12 +169,12 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*/
def foldByKey(zeroValue: V, partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)] = {
// Serialize the zero value to a byte array so that we can get a new clone of it on each key
- val zeroBuffer = SparkEnv.get.closureSerializer.newInstance().serialize(zeroValue)
+ val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue)
val zeroArray = new Array[Byte](zeroBuffer.limit)
zeroBuffer.get(zeroArray)
// When deserializing, use a lazy val to create just one instance of the serializer per task
- lazy val cachedSerializer = SparkEnv.get.closureSerializer.newInstance()
+ lazy val cachedSerializer = SparkEnv.get.serializer.newInstance()
val createZero = () => cachedSerializer.deserialize[V](ByteBuffer.wrap(zeroArray))
combineByKey[V]((v: V) => func(createZero(), v), func, func, partitioner)
@@ -433,6 +437,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Note: This operation may be very expensive. If you are grouping in order to perform an
* aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]]
* or [[PairRDDFunctions.reduceByKey]] will provide much better performance.
+ *
+ * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any
+ * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]].
*/
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])] = {
// groupByKey shouldn't use map side combine because map side combine does not
@@ -454,6 +461,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Note: This operation may be very expensive. If you are grouping in order to perform an
* aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]]
* or [[PairRDDFunctions.reduceByKey]] will provide much better performance.
+ *
+ * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any
+ * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]].
*/
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])] = {
groupByKey(new HashPartitioner(numPartitions))
@@ -480,7 +490,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*/
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))] = {
this.cogroup(other, partitioner).flatMapValues( pair =>
- for (v <- pair._1; w <- pair._2) yield (v, w)
+ for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, w)
)
}
@@ -493,9 +503,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
def leftOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))] = {
this.cogroup(other, partitioner).flatMapValues { pair =>
if (pair._2.isEmpty) {
- pair._1.map(v => (v, None))
+ pair._1.iterator.map(v => (v, None))
} else {
- for (v <- pair._1; w <- pair._2) yield (v, Some(w))
+ for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, Some(w))
}
}
}
@@ -510,9 +520,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
: RDD[(K, (Option[V], W))] = {
this.cogroup(other, partitioner).flatMapValues { pair =>
if (pair._1.isEmpty) {
- pair._2.map(w => (None, w))
+ pair._2.iterator.map(w => (None, w))
} else {
- for (v <- pair._1; w <- pair._2) yield (Some(v), w)
+ for (v <- pair._1.iterator; w <- pair._2.iterator) yield (Some(v), w)
}
}
}
@@ -528,9 +538,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
def fullOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner)
: RDD[(K, (Option[V], Option[W]))] = {
this.cogroup(other, partitioner).flatMapValues {
- case (vs, Seq()) => vs.map(v => (Some(v), None))
- case (Seq(), ws) => ws.map(w => (None, Some(w)))
- case (vs, ws) => for (v <- vs; w <- ws) yield (Some(v), Some(w))
+ case (vs, Seq()) => vs.iterator.map(v => (Some(v), None))
+ case (Seq(), ws) => ws.iterator.map(w => (None, Some(w)))
+ case (vs, ws) => for (v <- vs.iterator; w <- ws.iterator) yield (Some(v), Some(w))
}
}
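
Switching the join comprehensions to .iterator keeps each per-key cross product lazy inside flatMapValues instead of materializing an intermediate collection first. Sketch of the difference:

    val vs = Seq(1, 2, 3)
    val ws = Seq("a", "b")

    // Strict: builds the full cross product as a Seq up front.
    val strict = for (v <- vs; w <- ws) yield (v, w)

    // Lazy: an Iterator that produces pairs only as the downstream consumer pulls them.
    val lazyPairs = for (v <- vs.iterator; w <- ws.iterator) yield (v, w)
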
@@ -961,19 +971,16 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
val outfmt = job.getOutputFormatClass
val jobFormat = outfmt.newInstance
- if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true)) {
+ if (isOutputSpecValidationEnabled) {
// FileOutputFormat ignores the filesystem parameter
jobFormat.checkOutputSpecs(job)
}
val writeShard = (context: TaskContext, iter: Iterator[(K,V)]) => {
val config = wrappedConf.value
- // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
- // around by taking a mod. We expect that no task will be attempted 2 billion times.
- val attemptNumber = (context.attemptId % Int.MaxValue).toInt
/* "reduce task" */
val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
- attemptNumber)
+ context.attemptNumber)
val hadoopContext = newTaskAttemptContext(config, attemptId)
val format = outfmt.newInstance
format match {
@@ -1000,7 +1007,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
writer.close(hadoopContext)
}
committer.commitTask(hadoopContext)
- bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
+ bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
1
} : Int
@@ -1039,7 +1046,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " +
valueClass.getSimpleName + ")")
- if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true)) {
+ if (isOutputSpecValidationEnabled) {
// FileOutputFormat ignores the filesystem parameter
val ignoredFs = FileSystem.get(hadoopConf)
hadoopConf.getOutputFormat.checkOutputSpecs(ignoredFs, hadoopConf)
@@ -1052,11 +1059,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
val config = wrappedConf.value
// Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
// around by taking a mod. We expect that no task will be attempted 2 billion times.
- val attemptNumber = (context.attemptId % Int.MaxValue).toInt
+ val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt
val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)
- writer.setup(context.stageId, context.partitionId, attemptNumber)
+ writer.setup(context.stageId, context.partitionId, taskAttemptId)
writer.open()
try {
var recordsWritten = 0L
@@ -1072,7 +1079,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
writer.close()
}
writer.commit()
- bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
+ bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
}
self.context.runJob(self, writeToFile)
@@ -1095,7 +1102,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
outputMetrics: OutputMetrics, recordsWritten: Long): Unit = {
if (recordsWritten % PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES == 0
&& bytesWrittenCallback.isDefined) {
- bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
+ bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
}
}
@@ -1114,8 +1121,22 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
private[spark] def valueClass: Class[_] = vt.runtimeClass
private[spark] def keyOrdering: Option[Ordering[K]] = Option(ord)
+
+ // Note: this needs to be a function instead of a 'val' so that the disableOutputSpecValidation
+ // setting can take effect:
+ private def isOutputSpecValidationEnabled: Boolean = {
+ val validationDisabled = PairRDDFunctions.disableOutputSpecValidation.value
+ val enabledInConf = self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true)
+ enabledInConf && !validationDisabled
+ }
}
private[spark] object PairRDDFunctions {
val RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES = 256
+
+ /**
+ * Allows for the `spark.hadoop.validateOutputSpecs` checks to be disabled on a case-by-case
+ * basis; see SPARK-4835 for more details.
+ */
+ val disableOutputSpecValidation: DynamicVariable[Boolean] = new DynamicVariable[Boolean](false)
}
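
disableOutputSpecValidation is a scala.util.DynamicVariable, so a caller (see SPARK-4835) can turn the check off for one dynamically scoped block without touching global configuration. Usage sketch of the mechanism:

    import scala.util.DynamicVariable

    val disableValidation = new DynamicVariable[Boolean](false)

    disableValidation.withValue(true) {
      // Inside this block (and anything it calls on the same thread) the value is true.
      assert(disableValidation.value)
    }
    assert(!disableValidation.value)  // reverts automatically outside the block
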
diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala
index 87b22de6ae697..f12d0cffaba34 100644
--- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala
@@ -111,7 +111,8 @@ private object ParallelCollectionRDD {
/**
* Slice a collection into numSlices sub-collections. One extra thing we do here is to treat Range
* collections specially, encoding the slices as other Ranges to minimize memory cost. This makes
- * it efficient to run Spark over RDDs representing large sets of numbers.
+ * it efficient to run Spark over RDDs representing large sets of numbers. If the collection
+ * is an inclusive Range, we use an inclusive Range for the last slice.
*/
def slice[T: ClassTag](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = {
if (numSlices < 1) {
@@ -127,19 +128,15 @@ private object ParallelCollectionRDD {
})
}
seq match {
- case r: Range.Inclusive => {
- val sign = if (r.step < 0) {
- -1
- } else {
- 1
- }
- slice(new Range(
- r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices)
- }
case r: Range => {
- positions(r.length, numSlices).map({
- case (start, end) =>
+ positions(r.length, numSlices).zipWithIndex.map({ case ((start, end), index) =>
+ // If the range is inclusive, use inclusive range for the last slice
+ if (r.isInclusive && index == numSlices - 1) {
+ new Range.Inclusive(r.start + start * r.step, r.end, r.step)
+ }
+ else {
new Range(r.start + start * r.step, r.start + end * r.step, r.step)
+ }
}).toSeq.asInstanceOf[Seq[Seq[T]]]
}
case nr: NumericRange[_] => {
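
The new doc sentence and the zipWithIndex branch above preserve the endpoint of an inclusive Range in the final slice. A hedged, self-contained re-statement of that rule (not Spark's private slice implementation):

    // The final slice of an inclusive Range stays inclusive so the endpoint is not dropped.
    def sliceRange(r: Range, numSlices: Int): Seq[Range] =
      (0 until numSlices).map { i =>
        val start = ((i * r.length.toLong) / numSlices).toInt
        val end   = (((i + 1) * r.length.toLong) / numSlices).toInt
        if (r.isInclusive && i == numSlices - 1) {
          new Range.Inclusive(r.start + start * r.step, r.end, r.step)
        } else {
          new Range(r.start + start * r.step, r.start + end * r.step, r.step)
        }
      }

    // sliceRange(1 to 10, 3) => slices 1,2,3 / 4,5,6 / 7,8,9,10 (endpoint 10 kept)
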
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 214f22bc5b603..ab7410a1f7f99 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -76,10 +76,27 @@ import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, Bernoulli
* on RDD internals.
*/
abstract class RDD[T: ClassTag](
- @transient private var sc: SparkContext,
+ @transient private var _sc: SparkContext,
@transient private var deps: Seq[Dependency[_]]
) extends Serializable with Logging {
+ if (classOf[RDD[_]].isAssignableFrom(elementClassTag.runtimeClass)) {
+ // This is a warning instead of an exception in order to avoid breaking user programs that
+ // might have defined nested RDDs without running jobs with them.
+ logWarning("Spark does not support nested RDDs (see SPARK-5063)")
+ }
+
+ private def sc: SparkContext = {
+ if (_sc == null) {
+ throw new SparkException(
+ "RDD transformations and actions can only be invoked by the driver, not inside of other " +
+ "transformations; for example, rdd1.map(x => rdd2.values.count() * x) is invalid because " +
+ "the values transformation and count action cannot be performed inside of the rdd1.map " +
+ "transformation. For more information, see SPARK-5063.")
+ }
+ _sc
+ }
+
/** Construct an RDD with just a one-to-one dependency on one parent */
def this(@transient oneParent: RDD[_]) =
this(oneParent.context , List(new OneToOneDependency(oneParent)))
@@ -1146,15 +1163,20 @@ abstract class RDD[T: ClassTag](
if (num == 0) {
Array.empty
} else {
- mapPartitions { items =>
+ val mapRDDs = mapPartitions { items =>
// Priority keeps the largest elements, so let's reverse the ordering.
val queue = new BoundedPriorityQueue[T](num)(ord.reverse)
queue ++= util.collection.Utils.takeOrdered(items, num)(ord)
Iterator.single(queue)
- }.reduce { (queue1, queue2) =>
- queue1 ++= queue2
- queue1
- }.toArray.sorted(ord)
+ }
+ if (mapRDDs.partitions.size == 0) {
+ Array.empty
+ } else {
+ mapRDDs.reduce { (queue1, queue2) =>
+ queue1 ++= queue2
+ queue1
+ }.toArray.sorted(ord)
+ }
}
}
@@ -1170,11 +1192,36 @@ abstract class RDD[T: ClassTag](
* */
def min()(implicit ord: Ordering[T]): T = this.reduce(ord.min)
+ /**
+ * @return true if and only if the RDD contains no elements at all. Note that an RDD
+ * may be empty even when it has at least 1 partition.
+ */
+ def isEmpty(): Boolean = partitions.length == 0 || take(1).length == 0
+
/**
* Save this RDD as a text file, using string representations of elements.
*/
def saveAsTextFile(path: String) {
- this.map(x => (NullWritable.get(), new Text(x.toString)))
+ // https://issues.apache.org/jira/browse/SPARK-2075
+ //
+ // NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot find an implicit
+ // Ordering for it and will use the default `null`. However, it's a `Comparable[NullWritable]`
+ // in Hadoop 2.+, so the compiler will call the implicit `Ordering.ordered` method to create an
+ // Ordering for `NullWritable`. That's why the compiler will generate different anonymous
+ // classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
+ //
+    // Therefore, here we provide an explicit Ordering `null` to make sure the compiler generates
+    // the same bytecode for `saveAsTextFile`.
+ val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
+ val textClassTag = implicitly[ClassTag[Text]]
+ val r = this.mapPartitions { iter =>
+ val text = new Text()
+ iter.map { x =>
+ text.set(x.toString)
+ (NullWritable.get(), text)
+ }
+ }
+ RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
.saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}
@@ -1182,7 +1229,17 @@ abstract class RDD[T: ClassTag](
* Save this RDD as a compressed text file, using string representations of elements.
*/
def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]) {
- this.map(x => (NullWritable.get(), new Text(x.toString)))
+ // https://issues.apache.org/jira/browse/SPARK-2075
+ val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
+ val textClassTag = implicitly[ClassTag[Text]]
+ val r = this.mapPartitions { iter =>
+ val text = new Text()
+ iter.map { x =>
+ text.set(x.toString)
+ (NullWritable.get(), text)
+ }
+ }
+ RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
.saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path, codec)
}
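
The new isEmpty() checks the partition count before it ever looks at data, and as its doc notes, an RDD can have partitions yet no elements. Usage sketch (assumes a SparkContext named sc):

    sc.parallelize(Seq.empty[Int], numSlices = 4).isEmpty()  // true: four partitions, no elements
    sc.parallelize(Seq(1, 2, 3)).isEmpty()                   // false
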
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index cb8ccfbdbdcbb..1cfe98673773a 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -19,6 +19,7 @@ package org.apache.spark.scheduler
import java.io.NotSerializableException
import java.util.Properties
+import java.util.concurrent.{TimeUnit, Executors}
import java.util.concurrent.atomic.AtomicInteger
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map, Stack}
@@ -28,8 +29,6 @@ import scala.language.postfixOps
import scala.reflect.ClassTag
import scala.util.control.NonFatal
-import akka.actor._
-import akka.actor.SupervisorStrategy.Stop
import akka.pattern.ask
import akka.util.Timeout
@@ -39,7 +38,7 @@ import org.apache.spark.executor.TaskMetrics
import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage._
-import org.apache.spark.util.{CallSite, SystemClock, Clock, Utils}
+import org.apache.spark.util.{CallSite, EventLoop, SystemClock, Clock, Utils}
import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
/**
@@ -67,8 +66,6 @@ class DAGScheduler(
clock: Clock = SystemClock)
extends Logging {
- import DAGScheduler._
-
def this(sc: SparkContext, taskScheduler: TaskScheduler) = {
this(
sc,
@@ -112,14 +109,10 @@ class DAGScheduler(
// stray messages to detect.
private val failedEpoch = new HashMap[String, Long]
- private val dagSchedulerActorSupervisor =
- env.actorSystem.actorOf(Props(new DAGSchedulerActorSupervisor(this)))
-
// A closure serializer that we reuse.
// This is only safe because DAGScheduler runs in a single thread.
private val closureSerializer = SparkEnv.get.closureSerializer.newInstance()
- private[scheduler] var eventProcessActor: ActorRef = _
/** If enabled, we may run certain actions like take() and first() locally. */
private val localExecutionEnabled = sc.getConf.getBoolean("spark.localExecution.enabled", false)
@@ -127,26 +120,20 @@ class DAGScheduler(
/** If enabled, FetchFailed will not cause stage retry, in order to surface the problem. */
private val disallowStageRetryForTest = sc.getConf.getBoolean("spark.test.noStageRetry", false)
- private def initializeEventProcessActor() {
- // blocking the thread until supervisor is started, which ensures eventProcessActor is
- // not null before any job is submitted
- implicit val timeout = Timeout(30 seconds)
- val initEventActorReply =
- dagSchedulerActorSupervisor ? Props(new DAGSchedulerEventProcessActor(this))
- eventProcessActor = Await.result(initEventActorReply, timeout.duration).
- asInstanceOf[ActorRef]
- }
+ private val messageScheduler =
+ Executors.newScheduledThreadPool(1, Utils.namedThreadFactory("dag-scheduler-message"))
- initializeEventProcessActor()
+ private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this)
+ taskScheduler.setDAGScheduler(this)
// Called by TaskScheduler to report task's starting.
def taskStarted(task: Task[_], taskInfo: TaskInfo) {
- eventProcessActor ! BeginEvent(task, taskInfo)
+ eventProcessLoop.post(BeginEvent(task, taskInfo))
}
// Called to report that a task has completed and results are being fetched remotely.
def taskGettingResult(taskInfo: TaskInfo) {
- eventProcessActor ! GettingResultEvent(taskInfo)
+ eventProcessLoop.post(GettingResultEvent(taskInfo))
}
// Called by TaskScheduler to report task completions or failures.
@@ -157,7 +144,8 @@ class DAGScheduler(
accumUpdates: Map[Long, Any],
taskInfo: TaskInfo,
taskMetrics: TaskMetrics) {
- eventProcessActor ! CompletionEvent(task, reason, result, accumUpdates, taskInfo, taskMetrics)
+ eventProcessLoop.post(
+ CompletionEvent(task, reason, result, accumUpdates, taskInfo, taskMetrics))
}
/**
@@ -179,18 +167,18 @@ class DAGScheduler(
// Called by TaskScheduler when an executor fails.
def executorLost(execId: String) {
- eventProcessActor ! ExecutorLost(execId)
+ eventProcessLoop.post(ExecutorLost(execId))
}
// Called by TaskScheduler when a host is added
def executorAdded(execId: String, host: String) {
- eventProcessActor ! ExecutorAdded(execId, host)
+ eventProcessLoop.post(ExecutorAdded(execId, host))
}
// Called by TaskScheduler to cancel an entire TaskSet due to either repeated failures or
// cancellation of the job itself.
def taskSetFailed(taskSet: TaskSet, reason: String) {
- eventProcessActor ! TaskSetFailed(taskSet, reason)
+ eventProcessLoop.post(TaskSetFailed(taskSet, reason))
}
private def getCacheLocs(rdd: RDD[_]): Array[Seq[TaskLocation]] = {
@@ -495,8 +483,8 @@ class DAGScheduler(
assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
- eventProcessActor ! JobSubmitted(
- jobId, rdd, func2, partitions.toArray, allowLocal, callSite, waiter, properties)
+ eventProcessLoop.post(JobSubmitted(
+ jobId, rdd, func2, partitions.toArray, allowLocal, callSite, waiter, properties))
waiter
}
@@ -536,8 +524,8 @@ class DAGScheduler(
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val partitions = (0 until rdd.partitions.size).toArray
val jobId = nextJobId.getAndIncrement()
- eventProcessActor ! JobSubmitted(
- jobId, rdd, func2, partitions, allowLocal = false, callSite, listener, properties)
+ eventProcessLoop.post(JobSubmitted(
+ jobId, rdd, func2, partitions, allowLocal = false, callSite, listener, properties))
listener.awaitResult() // Will throw an exception if the job fails
}
@@ -546,19 +534,19 @@ class DAGScheduler(
*/
def cancelJob(jobId: Int) {
logInfo("Asked to cancel job " + jobId)
- eventProcessActor ! JobCancelled(jobId)
+ eventProcessLoop.post(JobCancelled(jobId))
}
def cancelJobGroup(groupId: String) {
logInfo("Asked to cancel job group " + groupId)
- eventProcessActor ! JobGroupCancelled(groupId)
+ eventProcessLoop.post(JobGroupCancelled(groupId))
}
/**
* Cancel all jobs that are running or waiting in the queue.
*/
def cancelAllJobs() {
- eventProcessActor ! AllJobsCancelled
+ eventProcessLoop.post(AllJobsCancelled)
}
private[scheduler] def doCancelAllJobs() {
@@ -574,7 +562,7 @@ class DAGScheduler(
* Cancel all jobs associated with a running or scheduled stage.
*/
def cancelStage(stageId: Int) {
- eventProcessActor ! StageCancelled(stageId)
+ eventProcessLoop.post(StageCancelled(stageId))
}
/**
@@ -634,8 +622,8 @@ class DAGScheduler(
try {
val rdd = job.finalStage.rdd
val split = rdd.partitions(job.partitions(0))
- val taskContext =
- new TaskContextImpl(job.finalStage.id, job.partitions(0), 0, true)
+ val taskContext = new TaskContextImpl(job.finalStage.id, job.partitions(0), taskAttemptId = 0,
+ attemptNumber = 0, runningLocally = true)
TaskContextHelper.setTaskContext(taskContext)
try {
val result = job.func(taskContext, rdd.iterator(split, taskContext))
@@ -660,7 +648,7 @@ class DAGScheduler(
// completion events or stage abort
stageIdToStage -= s.id
jobIdToStageIds -= job.jobId
- listenerBus.post(SparkListenerJobEnd(job.jobId, jobResult))
+ listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), jobResult))
}
}
@@ -709,7 +697,7 @@ class DAGScheduler(
stage.latestInfo.stageFailed(stageFailedMessage)
listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
}
- listenerBus.post(SparkListenerJobEnd(job.jobId, JobFailed(error)))
+ listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), JobFailed(error)))
}
}
@@ -748,9 +736,11 @@ class DAGScheduler(
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
+ val jobSubmissionTime = clock.getTime()
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
- listenerBus.post(SparkListenerJobStart(job.jobId, Seq.empty, properties))
+ listenerBus.post(
+ SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job)
} else {
jobIdToActiveJob(jobId) = job
@@ -758,7 +748,8 @@ class DAGScheduler(
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
- listenerBus.post(SparkListenerJobStart(job.jobId, stageInfos, properties))
+ listenerBus.post(
+ SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
submitStage(finalStage)
}
}
@@ -865,26 +856,6 @@ class DAGScheduler(
}
if (tasks.size > 0) {
- // Preemptively serialize a task to make sure it can be serialized. We are catching this
- // exception here because it would be fairly hard to catch the non-serializable exception
- // down the road, where we have several different implementations for local scheduler and
- // cluster schedulers.
- //
- // We've already serialized RDDs and closures in taskBinary, but here we check for all other
- // objects such as Partition.
- try {
- closureSerializer.serialize(tasks.head)
- } catch {
- case e: NotSerializableException =>
- abortStage(stage, "Task not serializable: " + e.toString)
- runningStages -= stage
- return
- case NonFatal(e) => // Other exceptions, such as IllegalArgumentException from Kryo.
- abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
- runningStages -= stage
- return
- }
-
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
@@ -984,7 +955,8 @@ class DAGScheduler(
if (job.numFinished == job.numPartitions) {
markStageAsFinished(stage)
cleanupStateForJobAndIndependentStages(job)
- listenerBus.post(SparkListenerJobEnd(job.jobId, JobSucceeded))
+ listenerBus.post(
+ SparkListenerJobEnd(job.jobId, clock.getTime(), JobSucceeded))
}
// taskSucceeded runs some user code that might throw an exception. Make sure
@@ -1078,16 +1050,15 @@ class DAGScheduler(
if (disallowStageRetryForTest) {
abortStage(failedStage, "Fetch failure will not retry stage due to testing config")
- } else if (failedStages.isEmpty && eventProcessActor != null) {
+ } else if (failedStages.isEmpty) {
// Don't schedule an event to resubmit failed stages if failed isn't empty, because
- // in that case the event will already have been scheduled. eventProcessActor may be
- // null during unit tests.
+ // in that case the event will already have been scheduled.
// TODO: Cancel running tasks in the stage
- import env.actorSystem.dispatcher
logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " +
s"$failedStage (${failedStage.name}) due to fetch failure")
- env.actorSystem.scheduler.scheduleOnce(
- RESUBMIT_TIMEOUT, eventProcessActor, ResubmitFailedStages)
+ messageScheduler.schedule(new Runnable {
+ override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
+ }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
}
failedStages += failedStage
failedStages += mapStage
@@ -1253,7 +1224,7 @@ class DAGScheduler(
if (ableToCancelStages) {
job.listener.jobFailed(error)
cleanupStateForJobAndIndependentStages(job)
- listenerBus.post(SparkListenerJobEnd(job.jobId, JobFailed(error)))
+ listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), JobFailed(error)))
}
}
@@ -1345,46 +1316,21 @@ class DAGScheduler(
def stop() {
logInfo("Stopping DAGScheduler")
- dagSchedulerActorSupervisor ! PoisonPill
+ eventProcessLoop.stop()
taskScheduler.stop()
}
-}
-
-private[scheduler] class DAGSchedulerActorSupervisor(dagScheduler: DAGScheduler)
- extends Actor with Logging {
-
- override val supervisorStrategy =
- OneForOneStrategy() {
- case x: Exception =>
- logError("eventProcesserActor failed; shutting down SparkContext", x)
- try {
- dagScheduler.doCancelAllJobs()
- } catch {
- case t: Throwable => logError("DAGScheduler failed to cancel all jobs.", t)
- }
- dagScheduler.sc.stop()
- Stop
- }
- def receive = {
- case p: Props => sender ! context.actorOf(p)
- case _ => logWarning("received unknown message in DAGSchedulerActorSupervisor")
- }
+ // Start the event thread at the end of the constructor
+ eventProcessLoop.start()
}
-private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGScheduler)
- extends Actor with Logging {
-
- override def preStart() {
- // set DAGScheduler for taskScheduler to ensure eventProcessActor is always
- // valid when the messages arrive
- dagScheduler.taskScheduler.setDAGScheduler(dagScheduler)
- }
+private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler)
+ extends EventLoop[DAGSchedulerEvent]("dag-scheduler-event-loop") with Logging {
/**
* The main event loop of the DAG scheduler.
*/
- def receive = {
+ override def onReceive(event: DAGSchedulerEvent): Unit = event match {
case JobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
listener, properties)
@@ -1423,7 +1369,17 @@ private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGSchedule
dagScheduler.resubmitFailedStages()
}
- override def postStop() {
+ override def onError(e: Throwable): Unit = {
+ logError("DAGSchedulerEventProcessLoop failed; shutting down SparkContext", e)
+ try {
+ dagScheduler.doCancelAllJobs()
+ } catch {
+ case t: Throwable => logError("DAGScheduler failed to cancel all jobs.", t)
+ }
+ dagScheduler.sc.stop()
+ }
+
+ override def onStop() {
// Cancel any active jobs in postStop hook
dagScheduler.cleanUpAfterSchedulerStop()
}
@@ -1433,9 +1389,5 @@ private[spark] object DAGScheduler {
// The time, in millis, to wait for fetch failure events to stop coming in after one is detected;
// this is a simplistic way to avoid resubmitting tasks in the non-fetchable map stage one by one
// as more failure events come in
- val RESUBMIT_TIMEOUT = 200.milliseconds
-
- // The time, in millis, to wake up between polls of the completion queue in order to potentially
- // resubmit failed stages
- val POLL_TIMEOUT = 10L
+ val RESUBMIT_TIMEOUT = 200
}
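
The DAGScheduler's event handling now runs on Spark's new single-threaded EventLoop instead of an Akka actor pair. The contract is small: post() enqueues an event, one daemon thread calls onReceive(), failures route to onError(), and stop() triggers onStop(). A minimal standalone sketch of that pattern follows, with illustrative names rather than Spark's actual EventLoop class:

import java.util.concurrent.LinkedBlockingDeque
import java.util.concurrent.atomic.AtomicBoolean

import scala.util.control.NonFatal

// Single-threaded event loop: events are posted to a queue and consumed by one
// daemon thread, so handlers never run concurrently with each other.
abstract class SimpleEventLoop[E](name: String) {
  private val eventQueue = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      while (!stopped.get) {
        try {
          val event = eventQueue.take()
          try onReceive(event) catch { case NonFatal(t) => onError(t) }
        } catch {
          case _: InterruptedException => // stop() interrupts the thread; the loop exits above
        }
      }
      onStop()
    }
  }

  def start(): Unit = eventThread.start()
  def stop(): Unit = { stopped.set(true); eventThread.interrupt() }
  def post(event: E): Unit = eventQueue.put(event)

  protected def onReceive(event: E): Unit
  protected def onError(e: Throwable): Unit
  protected def onStop(): Unit = { }
}
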
diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
index 597dbc884913c..30075c172bdb1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -17,20 +17,23 @@
package org.apache.spark.scheduler
+import java.io._
+import java.net.URI
+
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
+import com.google.common.base.Charsets
import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods._
-import org.apache.spark.{Logging, SparkConf, SparkContext}
+import org.apache.spark.{Logging, SparkConf, SPARK_VERSION}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.io.CompressionCodec
-import org.apache.spark.SPARK_VERSION
-import org.apache.spark.util.{FileLogger, JsonProtocol, Utils}
+import org.apache.spark.util.{JsonProtocol, Utils}
/**
* A SparkListener that logs events to persistent storage.
@@ -58,36 +61,78 @@ private[spark] class EventLoggingListener(
private val shouldOverwrite = sparkConf.getBoolean("spark.eventLog.overwrite", false)
private val testing = sparkConf.getBoolean("spark.eventLog.testing", false)
private val outputBufferSize = sparkConf.getInt("spark.eventLog.buffer.kb", 100) * 1024
- val logDir = EventLoggingListener.getLogDirPath(logBaseDir, appId)
- val logDirName: String = logDir.split("/").last
- protected val logger = new FileLogger(logDir, sparkConf, hadoopConf, outputBufferSize,
- shouldCompress, shouldOverwrite, Some(LOG_FILE_PERMISSIONS))
+ private val fileSystem = Utils.getHadoopFileSystem(new URI(logBaseDir), hadoopConf)
+
+ // Only defined if the file system scheme is not local
+ private var hadoopDataStream: Option[FSDataOutputStream] = None
+
+ // The Hadoop APIs have changed over time, so we use reflection to figure out
+  // the correct method for flushing a Hadoop data stream. See SPARK-1518
+ // for details.
+ private val hadoopFlushMethod = {
+ val cls = classOf[FSDataOutputStream]
+ scala.util.Try(cls.getMethod("hflush")).getOrElse(cls.getMethod("sync"))
+ }
+
+ private var writer: Option[PrintWriter] = None
// For testing. Keep track of all JSON serialized events that have been logged.
private[scheduler] val loggedEvents = new ArrayBuffer[JValue]
+ // Visible for tests only.
+ private[scheduler] val logPath = getLogPath(logBaseDir, appId)
+
/**
- * Begin logging events.
- * If compression is used, log a file that indicates which compression library is used.
+ * Creates the log file in the configured log directory.
*/
def start() {
- logger.start()
- logInfo("Logging events to %s".format(logDir))
- if (shouldCompress) {
- val codec =
- sparkConf.get("spark.io.compression.codec", CompressionCodec.DEFAULT_COMPRESSION_CODEC)
- logger.newFile(COMPRESSION_CODEC_PREFIX + codec)
+ if (!fileSystem.isDirectory(new Path(logBaseDir))) {
+ throw new IllegalArgumentException(s"Log directory $logBaseDir does not exist.")
}
- logger.newFile(SPARK_VERSION_PREFIX + SPARK_VERSION)
- logger.newFile(LOG_PREFIX + logger.fileIndex)
+
+ val workingPath = logPath + IN_PROGRESS
+ val uri = new URI(workingPath)
+ val path = new Path(workingPath)
+ val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme
+ val isDefaultLocal = defaultFs == null || defaultFs == "file"
+
+ if (shouldOverwrite && fileSystem.exists(path)) {
+ logWarning(s"Event log $path already exists. Overwriting...")
+ fileSystem.delete(path, true)
+ }
+
+ /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844).
+ * Therefore, for local files, use FileOutputStream instead. */
+ val dstream =
+ if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == "file") {
+ new FileOutputStream(uri.getPath)
+ } else {
+ hadoopDataStream = Some(fileSystem.create(path))
+ hadoopDataStream.get
+ }
+
+ val compressionCodec =
+ if (shouldCompress) {
+ Some(CompressionCodec.createCodec(sparkConf))
+ } else {
+ None
+ }
+
+ fileSystem.setPermission(path, LOG_FILE_PERMISSIONS)
+ val logStream = initEventLog(new BufferedOutputStream(dstream, outputBufferSize),
+ compressionCodec)
+ writer = Some(new PrintWriter(logStream))
+
+ logInfo("Logging events to %s".format(logPath))
}
/** Log the event as JSON. */
private def logEvent(event: SparkListenerEvent, flushLogger: Boolean = false) {
val eventJson = JsonProtocol.sparkEventToJson(event)
- logger.logLine(compact(render(eventJson)))
+ writer.foreach(_.println(compact(render(eventJson))))
if (flushLogger) {
- logger.flush()
+ writer.foreach(_.flush())
+ hadoopDataStream.foreach(hadoopFlushMethod.invoke(_))
}
if (testing) {
loggedEvents += eventJson
@@ -123,130 +168,168 @@ private[spark] class EventLoggingListener(
logEvent(event, flushLogger = true)
override def onApplicationEnd(event: SparkListenerApplicationEnd) =
logEvent(event, flushLogger = true)
+ override def onExecutorAdded(event: SparkListenerExecutorAdded) =
+ logEvent(event, flushLogger = true)
+ override def onExecutorRemoved(event: SparkListenerExecutorRemoved) =
+ logEvent(event, flushLogger = true)
+
// No-op because logging every update would be overkill
override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate) { }
/**
- * Stop logging events.
- * In addition, create an empty special file to indicate application completion.
+ * Stop logging events. The event log file will be renamed so that it loses the
+ * ".inprogress" suffix.
*/
def stop() = {
- logger.newFile(APPLICATION_COMPLETE)
- logger.stop()
+ writer.foreach(_.close())
+
+ val target = new Path(logPath)
+ if (fileSystem.exists(target)) {
+ if (shouldOverwrite) {
+ logWarning(s"Event log $target already exists. Overwriting...")
+ fileSystem.delete(target, true)
+ } else {
+ throw new IOException("Target log file already exists (%s)".format(logPath))
+ }
+ }
+ fileSystem.rename(new Path(logPath + IN_PROGRESS), target)
}
+
}
private[spark] object EventLoggingListener extends Logging {
+ // Suffix applied to the names of files still being written by applications.
+ val IN_PROGRESS = ".inprogress"
val DEFAULT_LOG_DIR = "/tmp/spark-events"
- val LOG_PREFIX = "EVENT_LOG_"
- val SPARK_VERSION_PREFIX = "SPARK_VERSION_"
- val COMPRESSION_CODEC_PREFIX = "COMPRESSION_CODEC_"
- val APPLICATION_COMPLETE = "APPLICATION_COMPLETE"
- val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort)
- // A cache for compression codecs to avoid creating the same codec many times
- private val codecMap = new mutable.HashMap[String, CompressionCodec]
+ private val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort)
- def isEventLogFile(fileName: String): Boolean = {
- fileName.startsWith(LOG_PREFIX)
- }
+ // Marker for the end of header data in a log file. After this marker, log data, potentially
+ // compressed, will be found.
+ private val HEADER_END_MARKER = "=== LOG_HEADER_END ==="
- def isSparkVersionFile(fileName: String): Boolean = {
- fileName.startsWith(SPARK_VERSION_PREFIX)
- }
+  // Cap on the header line length, so that a corrupted file cannot fill up the heap. The value is arbitrary.
+ private val MAX_HEADER_LINE_LENGTH = 4096
- def isCompressionCodecFile(fileName: String): Boolean = {
- fileName.startsWith(COMPRESSION_CODEC_PREFIX)
- }
+ // A cache for compression codecs to avoid creating the same codec many times
+ private val codecMap = new mutable.HashMap[String, CompressionCodec]
- def isApplicationCompleteFile(fileName: String): Boolean = {
- fileName == APPLICATION_COMPLETE
- }
+ /**
+ * Write metadata about the event log to the given stream.
+ *
+ * The header is a serialized version of a map, except it does not use Java serialization to
+ * avoid incompatibilities between different JDKs. It writes one map entry per line, in
+ * "key=value" format.
+ *
+ * The very last line of the header is the `HEADER_END_MARKER`, so that the parsing code
+ * knows when to stop.
+ *
+ * The format needs to be kept in sync with the openEventLog() method below. Also, it cannot
+ * change in new Spark versions without some other way of detecting the change (like some
+ * metadata encoded in the file name).
+ *
+ * @param logStream Raw output stream to the event log file.
+ * @param compressionCodec Optional compression codec to use.
+ * @return A stream to which event log data can be written. This may be a wrapper around the original
+ * stream (for example, when compression is enabled).
+ */
+ def initEventLog(
+ logStream: OutputStream,
+ compressionCodec: Option[CompressionCodec]): OutputStream = {
+ val meta = mutable.HashMap(("version" -> SPARK_VERSION))
+ compressionCodec.foreach { codec =>
+ meta += ("compressionCodec" -> codec.getClass().getName())
+ }
- def parseSparkVersion(fileName: String): String = {
- if (isSparkVersionFile(fileName)) {
- fileName.replaceAll(SPARK_VERSION_PREFIX, "")
- } else ""
- }
+ def write(entry: String) = {
+ val bytes = entry.getBytes(Charsets.UTF_8)
+ if (bytes.length > MAX_HEADER_LINE_LENGTH) {
+ throw new IOException(s"Header entry too long: ${entry}")
+ }
+ logStream.write(bytes, 0, bytes.length)
+ }
- def parseCompressionCodec(fileName: String): String = {
- if (isCompressionCodecFile(fileName)) {
- fileName.replaceAll(COMPRESSION_CODEC_PREFIX, "")
- } else ""
+ meta.foreach { case (k, v) => write(s"$k=$v\n") }
+ write(s"$HEADER_END_MARKER\n")
+ compressionCodec.map(_.compressedOutputStream(logStream)).getOrElse(logStream)
}
/**
- * Return a file-system-safe path to the log directory for the given application.
+ * Return a file-system-safe path to the log file for the given application.
*
- * @param logBaseDir A base directory for the path to the log directory for given application.
+ * @param logBaseDir Directory where the log file will be written.
* @param appId A unique app ID.
* @return A path which consists of file-system-safe characters.
*/
- def getLogDirPath(logBaseDir: String, appId: String): String = {
+ def getLogPath(logBaseDir: String, appId: String): String = {
val name = appId.replaceAll("[ :/]", "-").replaceAll("[${}'\"]", "_").toLowerCase
Utils.resolveURI(logBaseDir) + "/" + name.stripSuffix("/")
}
/**
- * Parse the event logging information associated with the logs in the given directory.
+ * Opens an event log file and returns an input stream to the event data.
*
- * Specifically, this looks for event log files, the Spark version file, the compression
- * codec file (if event logs are compressed), and the application completion file (if the
- * application has run to completion).
+ * @return 2-tuple (event input stream, Spark version of event data)
*/
- def parseLoggingInfo(logDir: Path, fileSystem: FileSystem): EventLoggingInfo = {
+ def openEventLog(log: Path, fs: FileSystem): (InputStream, String) = {
+ // It's not clear whether FileSystem.open() throws FileNotFoundException or just plain
+ // IOException when a file does not exist, so try our best to throw a proper exception.
+ if (!fs.exists(log)) {
+ throw new FileNotFoundException(s"File $log does not exist.")
+ }
+
+ val in = new BufferedInputStream(fs.open(log))
+ // Read a single line from the input stream without buffering.
+ // We cannot use BufferedReader because we must avoid reading
+ // beyond the end of the header, after which the content of the
+ // file may be compressed.
+ def readLine(): String = {
+ val bytes = new ByteArrayOutputStream()
+ var next = in.read()
+ var count = 0
+ while (next != '\n') {
+ if (next == -1) {
+ throw new IOException("Unexpected end of file.")
+ }
+ bytes.write(next)
+ count = count + 1
+ if (count > MAX_HEADER_LINE_LENGTH) {
+ throw new IOException("Maximum header line length exceeded.")
+ }
+ next = in.read()
+ }
+ new String(bytes.toByteArray(), Charsets.UTF_8)
+ }
+
+ // Parse the header metadata in the form of k=v pairs
+ // This assumes that every line before the header end marker follows this format
try {
- val fileStatuses = fileSystem.listStatus(logDir)
- val filePaths =
- if (fileStatuses != null) {
- fileStatuses.filter(!_.isDir).map(_.getPath).toSeq
- } else {
- Seq[Path]()
+ val meta = new mutable.HashMap[String, String]()
+ var foundEndMarker = false
+ while (!foundEndMarker) {
+ readLine() match {
+ case HEADER_END_MARKER =>
+ foundEndMarker = true
+ case entry =>
+ val prop = entry.split("=", 2)
+ if (prop.length != 2) {
+ throw new IllegalArgumentException("Invalid metadata in log file.")
+ }
+ meta += (prop(0) -> prop(1))
}
- if (filePaths.isEmpty) {
- logWarning("No files found in logging directory %s".format(logDir))
}
- EventLoggingInfo(
- logPaths = filePaths.filter { path => isEventLogFile(path.getName) },
- sparkVersion = filePaths
- .find { path => isSparkVersionFile(path.getName) }
- .map { path => parseSparkVersion(path.getName) }
- .getOrElse(""),
- compressionCodec = filePaths
- .find { path => isCompressionCodecFile(path.getName) }
- .map { path =>
- val codec = EventLoggingListener.parseCompressionCodec(path.getName)
- val conf = new SparkConf
- conf.set("spark.io.compression.codec", codec)
- codecMap.getOrElseUpdate(codec, CompressionCodec.createCodec(conf))
- },
- applicationComplete = filePaths.exists { path => isApplicationCompleteFile(path.getName) }
- )
+
+ val sparkVersion = meta.get("version").getOrElse(
+ throw new IllegalArgumentException("Missing Spark version in log metadata."))
+ val codec = meta.get("compressionCodec").map { codecName =>
+ codecMap.getOrElseUpdate(codecName, CompressionCodec.createCodec(new SparkConf, codecName))
+ }
+ (codec.map(_.compressedInputStream(in)).getOrElse(in), sparkVersion)
} catch {
case e: Exception =>
- logError("Exception in parsing logging info from directory %s".format(logDir), e)
- EventLoggingInfo.empty
+ in.close()
+ throw e
}
}
- /**
- * Parse the event logging information associated with the logs in the given directory.
- */
- def parseLoggingInfo(logDir: String, fileSystem: FileSystem): EventLoggingInfo = {
- parseLoggingInfo(new Path(logDir), fileSystem)
- }
-}
-
-
-/**
- * Information needed to process the event logs associated with an application.
- */
-private[spark] case class EventLoggingInfo(
- logPaths: Seq[Path],
- sparkVersion: String,
- compressionCodec: Option[CompressionCodec],
- applicationComplete: Boolean = false)
-
-private[spark] object EventLoggingInfo {
- def empty = EventLoggingInfo(Seq[Path](), "", None, applicationComplete = false)
}
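
initEventLog() and openEventLog() above define a small plain-text header: one "key=value" entry per line, terminated by the end-of-header marker, after which the (possibly compressed) event data begins. A standalone sketch of that format, assuming the header bytes have already been separated from the payload (the real reader avoids BufferedReader precisely because header and compressed data share a single stream):

import java.io.{BufferedReader, ByteArrayInputStream, ByteArrayOutputStream, InputStreamReader}
import java.nio.charset.StandardCharsets

object HeaderFormatSketch {
  val EndMarker = "=== LOG_HEADER_END ==="

  // Serialize metadata as "key=value" lines followed by the end marker.
  def writeHeader(meta: Map[String, String]): Array[Byte] = {
    val out = new ByteArrayOutputStream()
    meta.foreach { case (k, v) => out.write(s"$k=$v\n".getBytes(StandardCharsets.UTF_8)) }
    out.write(s"$EndMarker\n".getBytes(StandardCharsets.UTF_8))
    out.toByteArray
  }

  // Parse the header back into a map, stopping at the end marker.
  def readHeader(headerBytes: Array[Byte]): Map[String, String] = {
    val reader = new BufferedReader(
      new InputStreamReader(new ByteArrayInputStream(headerBytes), StandardCharsets.UTF_8))
    Iterator.continually(reader.readLine())
      .takeWhile(line => line != null && line != EndMarker)
      .map { line =>
        val Array(k, v) = line.split("=", 2)
        k -> v
      }
      .toMap
  }
}
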
diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala
index 01d5943d777f3..1efce124c0a6b 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala
@@ -122,7 +122,7 @@ private[spark] class CompressedMapStatus(
/**
* A [[MapStatus]] implementation that only stores the average size of non-empty blocks,
- * plus a bitmap for tracking which blocks are non-empty. During serialization, this bitmap
+ * plus a bitmap for tracking which blocks are empty. During serialization, this bitmap
* is compressed.
*
* @param loc location where the task is being executed
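
The corrected comment above describes HighlyCompressedMapStatus: it keeps only the average size of non-empty blocks plus a bitmap marking which blocks are empty, and the real implementation compresses that bitmap when serialized. A standalone sketch of the idea using a plain java.util.BitSet:

import java.util.BitSet

// Approximate per-block sizes: exactly zero for empty blocks, the average of
// the non-empty blocks for everything else.
class AverageSizeStatusSketch(sizes: Array[Long]) {
  private val emptyBlocks = new BitSet(sizes.length)
  sizes.zipWithIndex.foreach { case (size, i) => if (size == 0L) emptyBlocks.set(i) }

  private val nonEmpty = sizes.filter(_ > 0L)
  val avgSize: Long = if (nonEmpty.nonEmpty) nonEmpty.sum / nonEmpty.length else 0L

  def estimatedSize(blockId: Int): Long =
    if (emptyBlocks.get(blockId)) 0L else avgSize
}
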
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
index f89724d4ea196..584f4e7789d1a 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
@@ -17,74 +17,45 @@
package org.apache.spark.scheduler
-import java.io.{BufferedInputStream, InputStream}
+import java.io.{InputStream, IOException}
import scala.io.Source
-import org.apache.hadoop.fs.{Path, FileSystem}
import org.json4s.jackson.JsonMethods._
import org.apache.spark.Logging
-import org.apache.spark.io.CompressionCodec
import org.apache.spark.util.JsonProtocol
/**
- * A SparkListenerBus that replays logged events from persisted storage.
- *
- * This assumes the given paths are valid log files, where each line can be deserialized into
- * exactly one SparkListenerEvent.
+ * A SparkListenerBus that can be used to replay events from serialized event data.
*/
-private[spark] class ReplayListenerBus(
- logPaths: Seq[Path],
- fileSystem: FileSystem,
- compressionCodec: Option[CompressionCodec])
- extends SparkListenerBus with Logging {
-
- private var replayed = false
-
- if (logPaths.length == 0) {
- logWarning("Log path provided contains no log files.")
- }
+private[spark] class ReplayListenerBus extends SparkListenerBus with Logging {
/**
- * Replay each event in the order maintained in the given logs.
- * This should only be called exactly once.
+ * Replay each event in the order maintained in the given stream. The stream is expected to
+ * contain one JSON-encoded SparkListenerEvent per line.
+ *
+ * This method can be called multiple times, but the listener behavior is undefined after any
+ * error is thrown by this method.
+ *
+ * @param logData Stream containing event log data.
+ * @param version Spark version that generated the events.
*/
- def replay() {
- assert(!replayed, "ReplayListenerBus cannot replay events more than once")
- logPaths.foreach { path =>
- // Keep track of input streams at all levels to close them later
- // This is necessary because an exception can occur in between stream initializations
- var fileStream: Option[InputStream] = None
- var bufferedStream: Option[InputStream] = None
- var compressStream: Option[InputStream] = None
- var currentLine = ""
- try {
- fileStream = Some(fileSystem.open(path))
- bufferedStream = Some(new BufferedInputStream(fileStream.get))
- compressStream = Some(wrapForCompression(bufferedStream.get))
-
- // Parse each line as an event and post the event to all attached listeners
- val lines = Source.fromInputStream(compressStream.get).getLines()
- lines.foreach { line =>
- currentLine = line
- postToAll(JsonProtocol.sparkEventFromJson(parse(line)))
- }
- } catch {
- case e: Exception =>
- logError("Exception in parsing Spark event log %s".format(path), e)
- logError("Malformed line: %s\n".format(currentLine))
- } finally {
- fileStream.foreach(_.close())
- bufferedStream.foreach(_.close())
- compressStream.foreach(_.close())
+ def replay(logData: InputStream, version: String) {
+ var currentLine: String = null
+ try {
+ val lines = Source.fromInputStream(logData).getLines()
+ lines.foreach { line =>
+ currentLine = line
+ postToAll(JsonProtocol.sparkEventFromJson(parse(line)))
}
+ } catch {
+ case ioe: IOException =>
+ throw ioe
+ case e: Exception =>
+ logError("Exception in parsing Spark event log.", e)
+ logError("Malformed line: %s\n".format(currentLine))
}
- replayed = true
}
- /** If a compression codec is specified, wrap the given stream in a compression stream. */
- private def wrapForCompression(stream: InputStream): InputStream = {
- compressionCodec.map(_.compressedInputStream(stream)).getOrElse(stream)
- }
}
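
With the new stream-based API, replaying a log is a matter of opening the event stream and feeding it to a bus. A hedged usage sketch (these are private[spark] APIs; addListener, logFile, and fs are assumed here purely for illustration):

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.scheduler.{EventLoggingListener, ReplayListenerBus, SparkListener,
  SparkListenerApplicationEnd}

// fs and logFile are assumed to be supplied by the caller.
def replayOneLog(fs: FileSystem, logFile: String): Unit = {
  val (eventStream, sparkVersion) = EventLoggingListener.openEventLog(new Path(logFile), fs)
  val replayBus = new ReplayListenerBus()
  replayBus.addListener(new SparkListener {
    override def onApplicationEnd(end: SparkListenerApplicationEnd): Unit =
      println(s"Application ended at ${end.time}")
  })
  try {
    replayBus.replay(eventStream, sparkVersion)
  } finally {
    eventStream.close()
  }
}
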
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
index b62b0c1312693..e5d1eb767e109 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
@@ -25,6 +25,7 @@ import scala.collection.mutable
import org.apache.spark.{Logging, TaskEndReason}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.{Distribution, Utils}
@@ -58,6 +59,7 @@ case class SparkListenerTaskEnd(
@DeveloperApi
case class SparkListenerJobStart(
jobId: Int,
+ time: Long,
stageInfos: Seq[StageInfo],
properties: Properties = null)
extends SparkListenerEvent {
@@ -67,7 +69,11 @@ case class SparkListenerJobStart(
}
@DeveloperApi
-case class SparkListenerJobEnd(jobId: Int, jobResult: JobResult) extends SparkListenerEvent
+case class SparkListenerJobEnd(
+ jobId: Int,
+ time: Long,
+ jobResult: JobResult)
+ extends SparkListenerEvent
@DeveloperApi
case class SparkListenerEnvironmentUpdate(environmentDetails: Map[String, Seq[(String, String)]])
@@ -84,6 +90,14 @@ case class SparkListenerBlockManagerRemoved(time: Long, blockManagerId: BlockMan
@DeveloperApi
case class SparkListenerUnpersistRDD(rddId: Int) extends SparkListenerEvent
+@DeveloperApi
+case class SparkListenerExecutorAdded(executorId: String, executorInfo: ExecutorInfo)
+ extends SparkListenerEvent
+
+@DeveloperApi
+case class SparkListenerExecutorRemoved(executorId: String)
+ extends SparkListenerEvent
+
/**
* Periodic updates from executors.
* @param execId executor id
@@ -109,7 +123,8 @@ private[spark] case object SparkListenerShutdown extends SparkListenerEvent
/**
* :: DeveloperApi ::
* Interface for listening to events from the Spark scheduler. Note that this is an internal
- * interface which might change in different Spark releases.
+ * interface which might change in different Spark releases. Java clients should extend
+ * {@link JavaSparkListener}.
*/
@DeveloperApi
trait SparkListener {
@@ -183,6 +198,16 @@ trait SparkListener {
* Called when the driver receives task metrics from an executor in a heartbeat.
*/
def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate) { }
+
+ /**
+ * Called when the driver registers a new executor.
+ */
+ def onExecutorAdded(executorAdded: SparkListenerExecutorAdded) { }
+
+ /**
+ * Called when the driver removes an executor.
+ */
+ def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved) { }
}
/**
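
Applications can observe the new executor lifecycle events by overriding the two hooks added above. A hedged sketch of a listener that tracks cluster size (registration through sc.addSparkListener is assumed):

import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded,
  SparkListenerExecutorRemoved}

class ExecutorCountListener extends SparkListener {
  @volatile private var executors = Set.empty[String]

  override def onExecutorAdded(added: SparkListenerExecutorAdded): Unit = {
    executors += added.executorId
    println(s"Executor ${added.executorId} added on ${added.executorInfo.executorHost} " +
      s"with ${added.executorInfo.totalCores} cores; now ${executors.size} executors")
  }

  override def onExecutorRemoved(removed: SparkListenerExecutorRemoved): Unit = {
    executors -= removed.executorId
    println(s"Executor ${removed.executorId} removed; now ${executors.size} executors")
  }
}

// Registration, given an existing SparkContext `sc`:
//   sc.addSparkListener(new ExecutorCountListener)
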
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
index e79ffd7a3587d..e700c6af542f4 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
@@ -70,6 +70,10 @@ private[spark] trait SparkListenerBus extends Logging {
foreachListener(_.onApplicationEnd(applicationEnd))
case metricsUpdate: SparkListenerExecutorMetricsUpdate =>
foreachListener(_.onExecutorMetricsUpdate(metricsUpdate))
+ case executorAdded: SparkListenerExecutorAdded =>
+ foreachListener(_.onExecutorAdded(executorAdded))
+ case executorRemoved: SparkListenerExecutorRemoved =>
+ foreachListener(_.onExecutorRemoved(executorRemoved))
case SparkListenerShutdown =>
}
}
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 2552d03d18d06..847a4912eec13 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -44,10 +44,18 @@ import org.apache.spark.util.Utils
*/
private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) extends Serializable {
- final def run(attemptId: Long): T = {
- context = new TaskContextImpl(stageId, partitionId, attemptId, false)
+ /**
+ * Called by Executor to run this task.
+ *
+ * @param taskAttemptId an identifier for this task attempt that is unique within a SparkContext.
+ * @param attemptNumber how many times this task has been attempted (0 for the first attempt)
+ * @return the result of the task
+ */
+ final def run(taskAttemptId: Long, attemptNumber: Int): T = {
+ context = new TaskContextImpl(stageId = stageId, partitionId = partitionId,
+ taskAttemptId = taskAttemptId, attemptNumber = attemptNumber, runningLocally = false)
TaskContextHelper.setTaskContext(context)
- context.taskMetrics.hostname = Utils.localHostName()
+ context.taskMetrics.setHostname(Utils.localHostName())
taskThread = Thread.currentThread()
if (_killed) {
kill(interruptThread = false)
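
Because Task.run() now receives a distinct attemptNumber and threads it into the task context, user code can in principle tell first attempts from retries. A hedged sketch (TaskContext.get() and attemptNumber() are assumed to be part of the public API of this era; treat both as assumptions):

import org.apache.spark.{SparkContext, TaskContext}

// Count elements, logging when a partition is being re-attempted so that any
// external side effects could be made idempotent.
def countWithRetryAwareness(sc: SparkContext): Long = {
  sc.parallelize(1 to 100, 4).mapPartitions { iter =>
    val ctx = TaskContext.get()
    if (ctx.attemptNumber() > 0) {
      println(s"This partition is on attempt ${ctx.attemptNumber()}")
    }
    iter
  }.count()
}
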
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
index 4c96b9e5fef60..1c7c81c488c3a 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
@@ -27,6 +27,7 @@ import org.apache.spark.util.SerializableBuffer
*/
private[spark] class TaskDescription(
val taskId: Long,
+ val attemptNumber: Int,
val executorId: String,
val name: String,
val index: Int, // Index within this task's TaskSet
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
index 819b51e12ad8c..774f3d8cdb275 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala
@@ -19,6 +19,7 @@ package org.apache.spark.scheduler
import java.nio.ByteBuffer
+import scala.language.existentials
import scala.util.control.NonFatal
import org.apache.spark._
@@ -76,7 +77,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul
(deserializedResult, size)
}
- result.metrics.resultSize = size
+ result.metrics.setResultSize(size)
scheduler.handleSuccessfulTask(taskSetManager, tid, result)
} catch {
case cnf: ClassNotFoundException =>
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index cd3c015321e85..33a7aae5d3fcd 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -31,6 +31,7 @@ import scala.util.Random
import org.apache.spark._
import org.apache.spark.TaskState.TaskState
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
+import org.apache.spark.scheduler.TaskLocality.TaskLocality
import org.apache.spark.util.Utils
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockManagerId
@@ -167,7 +168,7 @@ private[spark] class TaskSchedulerImpl(
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
- "and have sufficient memory")
+ "and have sufficient resources")
} else {
this.cancel()
}
@@ -209,6 +210,40 @@ private[spark] class TaskSchedulerImpl(
.format(manager.taskSet.id, manager.parent.name))
}
+ private def resourceOfferSingleTaskSet(
+ taskSet: TaskSetManager,
+ maxLocality: TaskLocality,
+ shuffledOffers: Seq[WorkerOffer],
+ availableCpus: Array[Int],
+ tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
+ var launchedTask = false
+ for (i <- 0 until shuffledOffers.size) {
+ val execId = shuffledOffers(i).executorId
+ val host = shuffledOffers(i).host
+ if (availableCpus(i) >= CPUS_PER_TASK) {
+ try {
+ for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
+ tasks(i) += task
+ val tid = task.taskId
+ taskIdToTaskSetId(tid) = taskSet.taskSet.id
+ taskIdToExecutorId(tid) = execId
+ executorsByHost(host) += execId
+ availableCpus(i) -= CPUS_PER_TASK
+ assert(availableCpus(i) >= 0)
+ launchedTask = true
+ }
+ } catch {
+ case e: TaskNotSerializableException =>
+ logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
+ // Do not offer resources for this task, but don't throw an error to allow other
+ // task sets to be submitted.
+ return launchedTask
+ }
+ }
+ }
+ return launchedTask
+ }
+
/**
* Called by cluster manager to offer resources on slaves. We respond by asking our active task
* sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
@@ -251,23 +286,8 @@ private[spark] class TaskSchedulerImpl(
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
- launchedTask = false
- for (i <- 0 until shuffledOffers.size) {
- val execId = shuffledOffers(i).executorId
- val host = shuffledOffers(i).host
- if (availableCpus(i) >= CPUS_PER_TASK) {
- for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
- tasks(i) += task
- val tid = task.taskId
- taskIdToTaskSetId(tid) = taskSet.taskSet.id
- taskIdToExecutorId(tid) = execId
- executorsByHost(host) += execId
- availableCpus(i) -= CPUS_PER_TASK
- assert(availableCpus(i) >= 0)
- launchedTask = true
- }
- }
- }
+ launchedTask = resourceOfferSingleTaskSet(
+ taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
@@ -394,9 +414,6 @@ private[spark] class TaskSchedulerImpl(
taskResultGetter.stop()
}
starvationTimer.cancel()
-
- // sleeping for an arbitrary 1 seconds to ensure that messages are sent out.
- Thread.sleep(1000L)
}
override def defaultParallelism() = backend.defaultParallelism()
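
resourceOfferSingleTaskSet above factors out one sweep of the round-robin fill loop; the caller keeps sweeping until a full pass launches nothing. A standalone sketch of that loop with illustrative names (not Spark's API):

// One executor offer with mutable spare capacity.
case class SketchOffer(executorId: String, host: String, var freeCores: Int)

// Repeatedly sweep all offers, handing out at most one task per offer per sweep,
// until a whole sweep launches nothing (no capacity or no schedulable tasks left).
def fillOffers(offers: Seq[SketchOffer], cpusPerTask: Int)
              (tryLaunch: SketchOffer => Boolean): Unit = {
  var launchedTask = true
  while (launchedTask) {
    launchedTask = false
    for (offer <- offers if offer.freeCores >= cpusPerTask) {
      if (tryLaunch(offer)) {
        offer.freeCores -= cpusPerTask
        launchedTask = true
      }
    }
  }
}
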
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index cabdc655f89bf..5c94c6bbcb37b 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -18,12 +18,14 @@
package org.apache.spark.scheduler
import java.io.NotSerializableException
+import java.nio.ByteBuffer
import java.util.Arrays
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet
import scala.math.{min, max}
+import scala.util.control.NonFatal
import org.apache.spark._
import org.apache.spark.executor.TaskMetrics
@@ -249,7 +251,7 @@ private[spark] class TaskSetManager(
* This method also cleans up any tasks in the list that have already
* been launched, since we want that to happen lazily.
*/
- private def findTaskFromList(execId: String, list: ArrayBuffer[Int]): Option[Int] = {
+ private def dequeueTaskFromList(execId: String, list: ArrayBuffer[Int]): Option[Int] = {
var indexOffset = list.size
while (indexOffset > 0) {
indexOffset -= 1
@@ -290,7 +292,7 @@ private[spark] class TaskSetManager(
* an attempt running on this host, in case the host is slow. In addition, the task should meet
* the given locality constraint.
*/
- private def findSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value)
+ private def dequeueSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value)
: Option[(Int, TaskLocality.Value)] =
{
speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set
@@ -366,22 +368,22 @@ private[spark] class TaskSetManager(
*
* @return An option containing (task index within the task set, locality, is speculative?)
*/
- private def findTask(execId: String, host: String, maxLocality: TaskLocality.Value)
+ private def dequeueTask(execId: String, host: String, maxLocality: TaskLocality.Value)
: Option[(Int, TaskLocality.Value, Boolean)] =
{
- for (index <- findTaskFromList(execId, getPendingTasksForExecutor(execId))) {
+ for (index <- dequeueTaskFromList(execId, getPendingTasksForExecutor(execId))) {
return Some((index, TaskLocality.PROCESS_LOCAL, false))
}
if (TaskLocality.isAllowed(maxLocality, TaskLocality.NODE_LOCAL)) {
- for (index <- findTaskFromList(execId, getPendingTasksForHost(host))) {
+ for (index <- dequeueTaskFromList(execId, getPendingTasksForHost(host))) {
return Some((index, TaskLocality.NODE_LOCAL, false))
}
}
if (TaskLocality.isAllowed(maxLocality, TaskLocality.NO_PREF)) {
// Look for noPref tasks after NODE_LOCAL to minimize cross-rack traffic
- for (index <- findTaskFromList(execId, pendingTasksWithNoPrefs)) {
+ for (index <- dequeueTaskFromList(execId, pendingTasksWithNoPrefs)) {
return Some((index, TaskLocality.PROCESS_LOCAL, false))
}
}
@@ -389,20 +391,20 @@ private[spark] class TaskSetManager(
if (TaskLocality.isAllowed(maxLocality, TaskLocality.RACK_LOCAL)) {
for {
rack <- sched.getRackForHost(host)
- index <- findTaskFromList(execId, getPendingTasksForRack(rack))
+ index <- dequeueTaskFromList(execId, getPendingTasksForRack(rack))
} {
return Some((index, TaskLocality.RACK_LOCAL, false))
}
}
if (TaskLocality.isAllowed(maxLocality, TaskLocality.ANY)) {
- for (index <- findTaskFromList(execId, allPendingTasks)) {
+ for (index <- dequeueTaskFromList(execId, allPendingTasks)) {
return Some((index, TaskLocality.ANY, false))
}
}
// find a speculative task if all other tasks have been scheduled
- findSpeculativeTask(execId, host, maxLocality).map {
+ dequeueSpeculativeTask(execId, host, maxLocality).map {
case (taskIndex, allowedLocality) => (taskIndex, allowedLocality, true)}
}
@@ -417,6 +419,7 @@ private[spark] class TaskSetManager(
* @param host the host Id of the offered resource
* @param maxLocality the maximum locality we want to schedule the tasks at
*/
+ @throws[TaskNotSerializableException]
def resourceOffer(
execId: String,
host: String,
@@ -436,7 +439,7 @@ private[spark] class TaskSetManager(
}
}
- findTask(execId, host, allowedLocality) match {
+ dequeueTask(execId, host, allowedLocality) match {
case Some((index, taskLocality, speculative)) => {
// Found a task; do some bookkeeping and return a task description
val task = tasks(index)
@@ -456,10 +459,17 @@ private[spark] class TaskSetManager(
}
// Serialize and return the task
val startTime = clock.getTime()
- // We rely on the DAGScheduler to catch non-serializable closures and RDDs, so in here
- // we assume the task can be serialized without exceptions.
- val serializedTask = Task.serializeWithDependencies(
- task, sched.sc.addedFiles, sched.sc.addedJars, ser)
+ val serializedTask: ByteBuffer = try {
+ Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser)
+ } catch {
+          // If the task cannot be serialized, then there's no point in re-attempting it,
+          // as it will always fail. So just abort the whole task set.
+ case NonFatal(e) =>
+ val msg = s"Failed to serialize task $taskId, not attempting to retry it."
+ logError(msg, e)
+ abort(s"$msg Exception during serialization: $e")
+ throw new TaskNotSerializableException(e)
+ }
if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
!emittedTaskSizeWarning) {
emittedTaskSizeWarning = true
@@ -477,7 +487,8 @@ private[spark] class TaskSetManager(
taskName, taskId, host, taskLocality, serializedTask.limit))
sched.dagScheduler.taskStarted(task, info)
- return Some(new TaskDescription(taskId, execId, taskName, index, serializedTask))
+ return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId,
+ taskName, index, serializedTask))
}
case _ =>
}
@@ -704,7 +715,7 @@ private[spark] class TaskSetManager(
// Re-enqueue pending tasks for this host based on the status of the cluster. Note
// that it's okay if we add a task to the same queue twice (if it had multiple preferred
- // locations), because findTaskFromList will skip already-running tasks.
+ // locations), because dequeueTaskFromList will skip already-running tasks.
for (index <- getPendingTasksForExecutor(execId)) {
addPendingTask(index, readding=true)
}
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index 29cd34429b881..5786d367464f4 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -27,8 +27,8 @@ import akka.actor._
import akka.pattern.ask
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
-import org.apache.spark.{SparkEnv, Logging, SparkException, TaskState}
-import org.apache.spark.scheduler.{SchedulerBackend, SlaveLost, TaskDescription, TaskSchedulerImpl, WorkerOffer}
+import org.apache.spark.{ExecutorAllocationClient, Logging, SparkEnv, SparkException, TaskState}
+import org.apache.spark.scheduler._
import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
import org.apache.spark.util.{ActorLogReceive, SerializableBuffer, AkkaUtils, Utils}
@@ -42,7 +42,7 @@ import org.apache.spark.util.{ActorLogReceive, SerializableBuffer, AkkaUtils, Ut
*/
private[spark]
class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSystem: ActorSystem)
- extends SchedulerBackend with Logging
+ extends ExecutorAllocationClient with SchedulerBackend with Logging
{
// Use an atomic variable to track total number of cores in the cluster for simplicity and speed
var totalCoreCount = new AtomicInteger(0)
@@ -66,6 +66,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
// Number of executors requested from the cluster manager that have not registered yet
private var numPendingExecutors = 0
+ private val listenerBus = scheduler.sc.listenerBus
+
// Executors we have requested the cluster manager to kill that have not died yet
private val executorsPendingToRemove = new HashSet[String]
@@ -106,6 +108,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
}
}
+ listenerBus.post(SparkListenerExecutorAdded(executorId, data))
makeOffers()
}
@@ -213,6 +216,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
totalCoreCount.addAndGet(-executorInfo.totalCores)
totalRegisteredExecutors.addAndGet(-1)
scheduler.executorLost(executorId, SlaveLost(reason))
+ listenerBus.post(SparkListenerExecutorRemoved(executorId))
case None => logError(s"Asked to remove non-existent executor $executorId")
}
}
@@ -307,7 +311,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
* Request an additional number of executors from the cluster manager.
* Return whether the request is acknowledged.
*/
- final def requestExecutors(numAdditionalExecutors: Int): Boolean = synchronized {
+ final override def requestExecutors(numAdditionalExecutors: Int): Boolean = synchronized {
logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager")
logDebug(s"Number of pending executors is now $numPendingExecutors")
numPendingExecutors += numAdditionalExecutors
@@ -334,7 +338,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
* Request that the cluster manager kill the specified executors.
* Return whether the kill request is acknowledged.
*/
- final def killExecutors(executorIds: Seq[String]): Boolean = synchronized {
+ final override def killExecutors(executorIds: Seq[String]): Boolean = synchronized {
logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")
val filteredExecutorIds = new ArrayBuffer[String]
executorIds.foreach { id =>
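
The backend now implements ExecutorAllocationClient, whose requestExecutors/killExecutors calls are what dynamic-allocation logic drives. A standalone sketch of a toy scaling policy written against such an interface (illustrative only; this is not Spark's ExecutorAllocationManager):

// Mirror of the two calls the backend overrides above.
trait AllocationClientSketch {
  def requestExecutors(numAdditionalExecutors: Int): Boolean
  def killExecutors(executorIds: Seq[String]): Boolean
}

// Request enough executors to cover the current task backlog.
class BacklogScalePolicy(client: AllocationClientSketch, tasksPerExecutor: Int) {
  def sync(pendingTasks: Int, currentExecutors: Int): Unit = {
    val wanted = math.max(1, (pendingTasks + tasksPerExecutor - 1) / tasksPerExecutor)
    if (wanted > currentExecutors) {
      client.requestExecutors(wanted - currentExecutors)
    }
  }
}
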
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala
index b71bd5783d6df..eb52ddfb1eab1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala
@@ -31,7 +31,7 @@ import akka.actor.{Address, ActorRef}
private[cluster] class ExecutorData(
val executorActor: ActorRef,
val executorAddress: Address,
- val executorHost: String ,
+ override val executorHost: String,
var freeCores: Int,
- val totalCores: Int
-)
+ override val totalCores: Int
+) extends ExecutorInfo(executorHost, totalCores)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala
new file mode 100644
index 0000000000000..b4738e64c9391
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.scheduler.cluster
+
+import org.apache.spark.annotation.DeveloperApi
+
+/**
+ * :: DeveloperApi ::
+ * Stores information about an executor to pass from the scheduler to SparkListeners.
+ */
+@DeveloperApi
+class ExecutorInfo(
+ val executorHost: String,
+ val totalCores: Int
+) {
+
+ def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]
+
+ override def equals(other: Any): Boolean = other match {
+ case that: ExecutorInfo =>
+ (that canEqual this) &&
+ executorHost == that.executorHost &&
+ totalCores == that.totalCores
+ case _ => false
+ }
+
+ override def hashCode(): Int = {
+ val state = Seq(executorHost, totalCores)
+ state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
+ }
+}
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
index 8c7de75600b5f..7eb87a564d6f5 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
@@ -55,19 +55,26 @@ private[spark] class SparkDeploySchedulerBackend(
"{{WORKER_URL}}")
val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions")
.map(Utils.splitCommandString).getOrElse(Seq.empty)
- val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath").toSeq.flatMap { cp =>
- cp.split(java.io.File.pathSeparator)
- }
- val libraryPathEntries =
- sc.conf.getOption("spark.executor.extraLibraryPath").toSeq.flatMap { cp =>
- cp.split(java.io.File.pathSeparator)
+ val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath")
+ .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
+ val libraryPathEntries = sc.conf.getOption("spark.executor.extraLibraryPath")
+ .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
+
+ // When testing, expose the parent class path to the child. This is processed by
+ // compute-classpath.{cmd,sh} and makes all needed jars available to child processes
+ // when the assembly is built with the "*-provided" profiles enabled.
+ val testingClassPath =
+ if (sys.props.contains("spark.testing")) {
+ sys.props("java.class.path").split(java.io.File.pathSeparator).toSeq
+ } else {
+ Nil
}
// Start executors with a few necessary configs for registering with the scheduler
val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
val javaOpts = sparkJavaOpts ++ extraJavaOpts
val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
- args, sc.executorEnvs, classPathEntries, libraryPathEntries, javaOpts)
+ args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")
val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
appUIAddress, sc.eventLogDir)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
index 50721b9d6cd6c..f14aaeea0a25c 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
@@ -17,6 +17,8 @@
package org.apache.spark.scheduler.cluster
+import scala.concurrent.{Future, ExecutionContext}
+
import akka.actor.{Actor, ActorRef, Props}
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
@@ -24,7 +26,9 @@ import org.apache.spark.SparkContext
import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.ui.JettyUtils
-import org.apache.spark.util.AkkaUtils
+import org.apache.spark.util.{AkkaUtils, Utils}
+
+import scala.util.control.NonFatal
/**
* Abstract Yarn scheduler backend that contains common logic
@@ -97,6 +101,9 @@ private[spark] abstract class YarnSchedulerBackend(
private class YarnSchedulerActor extends Actor {
private var amActor: Option[ActorRef] = None
+ implicit val askAmActorExecutor = ExecutionContext.fromExecutor(
+ Utils.newDaemonCachedThreadPool("yarn-scheduler-ask-am-executor"))
+
override def preStart(): Unit = {
// Listen for disassociation events
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
@@ -110,7 +117,12 @@ private[spark] abstract class YarnSchedulerBackend(
case r: RequestExecutors =>
amActor match {
case Some(actor) =>
- sender ! AkkaUtils.askWithReply[Boolean](r, actor, askTimeout)
+ val driverActor = sender
+ Future {
+ driverActor ! AkkaUtils.askWithReply[Boolean](r, actor, askTimeout)
+ } onFailure {
+ case NonFatal(e) => logError(s"Sending $r to AM was unsuccessful", e)
+ }
case None =>
logWarning("Attempted to request executors before the AM has registered!")
sender ! false
@@ -119,7 +131,12 @@ private[spark] abstract class YarnSchedulerBackend(
case k: KillExecutors =>
amActor match {
case Some(actor) =>
- sender ! AkkaUtils.askWithReply[Boolean](k, actor, askTimeout)
+ val driverActor = sender
+ Future {
+ driverActor ! AkkaUtils.askWithReply[Boolean](k, actor, askTimeout)
+ } onFailure {
+ case NonFatal(e) => logError(s"Sending $k to AM was unsuccessful", e)
+ }
case None =>
logWarning("Attempted to kill executors before the AM has registered!")
sender ! false
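
Both branches above replace a blocking askWithReply inside receive with a Future on a dedicated pool, after capturing sender into a local val so the reply goes to the right actor. A standalone sketch of that pattern with illustrative names:

import java.util.concurrent.Executors

import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success}

object AsyncAskSketch {
  // Dedicated pool so blocking asks never tie up the actor's own thread.
  private implicit val askExecutionContext: ExecutionContext =
    ExecutionContext.fromExecutorService(Executors.newCachedThreadPool())

  // Run the blocking call off-thread and deliver the result to a reply target
  // that was captured before going asynchronous.
  def forwardAsync[A](blockingAsk: => A)(replyTo: A => Unit): Unit = {
    Future(blockingAsk).onComplete {
      case Success(result) => replyTo(result)
      case Failure(e) => Console.err.println(s"Ask failed: $e") // the caller will time out
    }
  }
}
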
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
index 10e6886c16a4f..79c9051e88691 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
@@ -22,14 +22,17 @@ import java.util.{ArrayList => JArrayList, List => JList}
import java.util.Collections
import scala.collection.JavaConversions._
-import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
+import scala.collection.mutable.{HashMap, HashSet}
import org.apache.mesos.protobuf.ByteString
import org.apache.mesos.{Scheduler => MScheduler}
import org.apache.mesos._
-import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, TaskState => MesosTaskState, _}
+import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, TaskState => MesosTaskState,
+ ExecutorInfo => MesosExecutorInfo, _}
+import org.apache.spark.executor.MesosExecutorBackend
import org.apache.spark.{Logging, SparkContext, SparkException, TaskState}
+import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.scheduler._
import org.apache.spark.util.Utils
@@ -62,6 +65,9 @@ private[spark] class MesosSchedulerBackend(
var classLoader: ClassLoader = null
+ // The listener bus to publish executor added/removed events.
+ val listenerBus = sc.listenerBus
+
@volatile var appId: String = _
override def start() {
@@ -87,7 +93,7 @@ private[spark] class MesosSchedulerBackend(
}
}
- def createExecutorInfo(execId: String): ExecutorInfo = {
+ def createExecutorInfo(execId: String): MesosExecutorInfo = {
val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home")
.orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility
.getOrElse {
@@ -118,14 +124,15 @@ private[spark] class MesosSchedulerBackend(
val command = CommandInfo.newBuilder()
.setEnvironment(environment)
val uri = sc.conf.get("spark.executor.uri", null)
+ val executorBackendName = classOf[MesosExecutorBackend].getName
if (uri == null) {
- val executorPath = new File(executorSparkHome, "/sbin/spark-executor").getCanonicalPath
- command.setValue("%s %s".format(prefixEnv, executorPath))
+ val executorPath = new File(executorSparkHome, "/bin/spark-class").getCanonicalPath
+ command.setValue(s"$prefixEnv $executorPath $executorBackendName")
} else {
// Grab everything to the first '.'. We'll use that and '*' to
// glob the directory "correctly".
val basename = uri.split('/').last.split('.').head
- command.setValue("cd %s*; %s ./sbin/spark-executor".format(basename, prefixEnv))
+ command.setValue(s"cd ${basename}*; $prefixEnv ./bin/spark-class $executorBackendName")
command.addUris(CommandInfo.URI.newBuilder().setValue(uri))
}
val cpus = Resource.newBuilder()
@@ -141,7 +148,7 @@ private[spark] class MesosSchedulerBackend(
Value.Scalar.newBuilder()
.setValue(MemoryUtils.calculateTotalMemory(sc)).build())
.build()
- ExecutorInfo.newBuilder()
+ MesosExecutorInfo.newBuilder()
.setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
.setCommand(command)
.setData(ByteString.copyFrom(createExecArg()))
@@ -237,6 +244,7 @@ private[spark] class MesosSchedulerBackend(
}
val slaveIdToOffer = usableOffers.map(o => o.getSlaveId.getValue -> o).toMap
+ val slaveIdToWorkerOffer = workerOffers.map(o => o.executorId -> o).toMap
val mesosTasks = new HashMap[String, JArrayList[MesosTaskInfo]]
@@ -260,6 +268,10 @@ private[spark] class MesosSchedulerBackend(
val filters = Filters.newBuilder().setRefuseSeconds(1).build() // TODO: lower timeout?
mesosTasks.foreach { case (slaveId, tasks) =>
+ slaveIdToWorkerOffer.get(slaveId).foreach(o =>
+ listenerBus.post(SparkListenerExecutorAdded(slaveId,
+ new ExecutorInfo(o.host, o.cores)))
+ )
d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters)
}
@@ -296,7 +308,7 @@ private[spark] class MesosSchedulerBackend(
.setExecutor(createExecutorInfo(slaveId))
.setName(task.name)
.addResources(cpuResource)
- .setData(ByteString.copyFrom(task.serializedTask))
+ .setData(MesosTaskLaunchData(task.serializedTask, task.attemptNumber).toByteString)
.build()
}
@@ -315,7 +327,7 @@ private[spark] class MesosSchedulerBackend(
synchronized {
if (status.getState == MesosTaskState.TASK_LOST && taskIdToSlaveId.contains(tid)) {
// We lost the executor on this slave, so remember that it's gone
- slaveIdsWithExecutors -= taskIdToSlaveId(tid)
+ removeExecutor(taskIdToSlaveId(tid))
}
if (isFinished(status.getState)) {
taskIdToSlaveId.remove(tid)
@@ -344,12 +356,20 @@ private[spark] class MesosSchedulerBackend(
override def frameworkMessage(d: SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
+ /**
+   * Remove the executor associated with slaveId in a thread-safe manner.
+ */
+ private def removeExecutor(slaveId: String) = {
+ synchronized {
+ listenerBus.post(SparkListenerExecutorRemoved(slaveId))
+ slaveIdsWithExecutors -= slaveId
+ }
+ }
+
private def recordSlaveLost(d: SchedulerDriver, slaveId: SlaveID, reason: ExecutorLossReason) {
inClassLoader() {
logInfo("Mesos slave lost: " + slaveId.getValue)
- synchronized {
- slaveIdsWithExecutors -= slaveId.getValue
- }
+ removeExecutor(slaveId.getValue)
scheduler.executorLost(slaveId.getValue, reason)
}
}
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
new file mode 100644
index 0000000000000..5e7e6567a3e06
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.nio.ByteBuffer
+
+import org.apache.mesos.protobuf.ByteString
+
+import org.apache.spark.Logging
+
+/**
+ * Wrapper for serializing the data sent when launching Mesos tasks.
+ */
+private[spark] case class MesosTaskLaunchData(
+ serializedTask: ByteBuffer,
+ attemptNumber: Int) extends Logging {
+
+ def toByteString: ByteString = {
+ val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
+ dataBuffer.putInt(attemptNumber)
+ dataBuffer.put(serializedTask)
+ dataBuffer.rewind
+ logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
+ ByteString.copyFrom(dataBuffer)
+ }
+}
+
+private[spark] object MesosTaskLaunchData extends Logging {
+ def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
+ val byteBuffer = byteString.asReadOnlyByteBuffer()
+ logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
+ val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
+ val serializedTask = byteBuffer.slice() // subsequence starting at the current position
+ MesosTaskLaunchData(serializedTask, attemptNumber)
+ }
+}
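
A hedged usage sketch of the wire format defined above (MesosTaskLaunchData is private[spark]; this only illustrates the int-prefixed encoding and its round trip):

import java.nio.ByteBuffer

// The attempt number and the task bytes survive a round trip through ByteString.
val taskBytes = ByteBuffer.wrap("serialized task".getBytes("UTF-8"))
val wire = MesosTaskLaunchData(taskBytes, attemptNumber = 3).toByteString
val decoded = MesosTaskLaunchData.fromByteString(wire)
assert(decoded.attemptNumber == 3)
assert(decoded.serializedTask.remaining == taskBytes.capacity)
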
diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
index a2f1f14264a99..05b6fa54564b7 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
@@ -41,17 +41,18 @@ private case class StopExecutor()
* and the TaskSchedulerImpl.
*/
private[spark] class LocalActor(
- scheduler: TaskSchedulerImpl,
- executorBackend: LocalBackend,
- private val totalCores: Int) extends Actor with ActorLogReceive with Logging {
+ scheduler: TaskSchedulerImpl,
+ executorBackend: LocalBackend,
+ private val totalCores: Int)
+ extends Actor with ActorLogReceive with Logging {
private var freeCores = totalCores
private val localExecutorId = SparkContext.DRIVER_IDENTIFIER
private val localExecutorHostname = "localhost"
- val executor = new Executor(
- localExecutorId, localExecutorHostname, scheduler.conf.getAll, totalCores, isLocal = true)
+ private val executor = new Executor(
+ localExecutorId, localExecutorHostname, SparkEnv.get, isLocal = true)
override def receiveWithLogging = {
case ReviveOffers =>
@@ -75,7 +76,8 @@ private[spark] class LocalActor(
val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores))
for (task <- scheduler.resourceOffers(offers).flatten) {
freeCores -= scheduler.CPUS_PER_TASK
- executor.launchTask(executorBackend, task.taskId, task.name, task.serializedTask)
+ executor.launchTask(executorBackend, taskId = task.taskId, attemptNumber = task.attemptNumber,
+ task.name, task.serializedTask)
}
}
}
diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
index 662a7b91248aa..fa8a337ad63a8 100644
--- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
@@ -92,7 +92,7 @@ private[spark] class JavaSerializerInstance(counterReset: Int, defaultClassLoade
}
override def deserializeStream(s: InputStream): DeserializationStream = {
- new JavaDeserializationStream(s, Utils.getContextOrSparkClassLoader)
+ new JavaDeserializationStream(s, defaultClassLoader)
}
def deserializeStream(s: InputStream, loader: ClassLoader): DeserializationStream = {
diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 621a951c27d07..d56e23ce4478a 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -26,9 +26,10 @@ import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializ
import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator}
import org.apache.spark._
+import org.apache.spark.api.python.PythonBroadcast
import org.apache.spark.broadcast.HttpBroadcast
import org.apache.spark.network.nio.{PutBlock, GotBlock, GetBlock}
-import org.apache.spark.scheduler.MapStatus
+import org.apache.spark.scheduler.{CompressedMapStatus, HighlyCompressedMapStatus}
import org.apache.spark.storage._
import org.apache.spark.util.BoundedPriorityQueue
import org.apache.spark.util.collection.CompactBuffer
@@ -90,6 +91,7 @@ class KryoSerializer(conf: SparkConf)
// Allow sending SerializableWritable
kryo.register(classOf[SerializableWritable[_]], new KryoJavaSerializer())
kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer())
+ kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer())
try {
// Use the default classloader when calling the user registrator.
@@ -205,7 +207,8 @@ private[serializer] object KryoSerializer {
classOf[PutBlock],
classOf[GotBlock],
classOf[GetBlock],
- classOf[MapStatus],
+ classOf[CompressedMapStatus],
+ classOf[HighlyCompressedMapStatus],
classOf[CompactBuffer[_]],
classOf[BlockManagerId],
classOf[Array[Byte]],
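Both KryoSerializer hunks follow one registration pattern: classes that only know how to serialize themselves via Java serialization (HttpBroadcast, and now PythonBroadcast) are registered with Kryo's Java fallback serializer, while plain data holders are listed for direct registration so they serialize with compact integer IDs. A standalone sketch of that pattern; Record, JavaOnlyThing and KryoRegistrationSketch are made-up names:

    import com.esotericsoftware.kryo.Kryo
    import com.esotericsoftware.kryo.serializers.JavaSerializer

    case class Record(id: Int, name: String)

    class JavaOnlyThing extends java.io.Serializable   // stand-in for a PythonBroadcast-like class

    object KryoRegistrationSketch {
      def configure(kryo: Kryo): Kryo = {
        // Plain data class: Kryo's default field-by-field serializer is enough.
        kryo.register(classOf[Record])
        // Class that relies on Java serialization semantics: route it through the fallback.
        kryo.register(classOf[JavaOnlyThing], new JavaSerializer())
        kryo
      }
    }

In application code the same effect usually comes from SparkConf.registerKryoClasses or a custom spark.kryo.registrator rather than touching Kryo directly.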
diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala
index 801ae54086053..a44a8e1249256 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala
@@ -20,8 +20,8 @@ package org.apache.spark.shuffle
import org.apache.spark.{TaskContext, ShuffleDependency}
/**
- * Pluggable interface for shuffle systems. A ShuffleManager is created in SparkEnv on both the
- * driver and executors, based on the spark.shuffle.manager setting. The driver registers shuffles
+ * Pluggable interface for shuffle systems. A ShuffleManager is created in SparkEnv on the driver
+ * and on each executor, based on the spark.shuffle.manager setting. The driver registers shuffles
* with it, and executors (or tasks running locally in the driver) can ask to read and write data.
*
* NOTE: this will be instantiated by SparkEnv so its constructor can take a SparkConf and
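The reworded scaladoc is the key point: one ShuffleManager instance is created inside SparkEnv on the driver and on every executor, and spark.shuffle.manager decides which implementation that is. A minimal sketch of choosing it from a driver program (for this generation of Spark the short names are "hash" and "sort"; a fully qualified class name also works):

    import org.apache.spark.{SparkConf, SparkContext}

    object ShuffleManagerSketch {
      def main(args: Array[String]): Unit = {
        // The setting must be in place before the SparkContext (and its SparkEnv) is created.
        val conf = new SparkConf()
          .setAppName("shuffle-manager-sketch")
          .setMaster("local[2]")
          .set("spark.shuffle.manager", "sort")   // or "hash", or a ShuffleManager class name
        val sc = new SparkContext(conf)
        try {
          sc.parallelize(1 to 100).map(i => (i % 10, i)).reduceByKey(_ + _).count()
        } finally {
          sc.stop()
        }
      }
    }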
diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
index de72148ccc7ac..41bafabde05b9 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
@@ -59,8 +59,8 @@ private[spark] class HashShuffleReader[K, C](
// the ExternalSorter won't spill to disk.
val sorter = new ExternalSorter[K, C, C](ordering = Some(keyOrd), serializer = Some(ser))
sorter.insertAll(aggregatedIter)
- context.taskMetrics.memoryBytesSpilled += sorter.memoryBytesSpilled
- context.taskMetrics.diskBytesSpilled += sorter.diskBytesSpilled
+ context.taskMetrics.incMemoryBytesSpilled(sorter.memoryBytesSpilled)
+ context.taskMetrics.incDiskBytesSpilled(sorter.diskBytesSpilled)
sorter.iterator
case None =>
aggregatedIter
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index d7b184f8a10e9..8bc5a1cd18b64 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -34,10 +34,9 @@ import org.apache.spark.executor._
import org.apache.spark.io.CompressionCodec
import org.apache.spark.network._
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
-import org.apache.spark.network.netty.{SparkTransportConf, NettyBlockTransferService}
+import org.apache.spark.network.netty.SparkTransportConf
import org.apache.spark.network.shuffle.ExternalShuffleClient
import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
-import org.apache.spark.network.util.{ConfigProvider, TransportConf}
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleManager
import org.apache.spark.shuffle.hash.HashShuffleManager
@@ -54,7 +53,7 @@ private[spark] class BlockResult(
readMethod: DataReadMethod.Value,
bytes: Long) {
val inputMetrics = new InputMetrics(readMethod)
- inputMetrics.bytesRead = bytes
+ inputMetrics.addBytesRead(bytes)
}
/**
@@ -120,7 +119,7 @@ private[spark] class BlockManager(
private[spark] var shuffleServerId: BlockManagerId = _
// Client to read other executors' shuffle files. This is either an external service, or just the
- // standard BlockTranserService to directly connect to other Executors.
+ // standard BlockTransferService to directly connect to other Executors.
private[spark] val shuffleClient = if (externalShuffleServiceEnabled) {
val transConf = SparkTransportConf.fromSparkConf(conf, numUsableCores)
new ExternalShuffleClient(transConf, securityManager, securityManager.isAuthenticationEnabled())
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala
index 9cbda41223a8b..64133464d8daa 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala
@@ -52,8 +52,7 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus
private val akkaTimeout = AkkaUtils.askTimeout(conf)
- val slaveTimeout = conf.getLong("spark.storage.blockManagerSlaveTimeoutMs",
- math.max(conf.getInt("spark.executor.heartbeatInterval", 10000) * 3, 45000))
+ val slaveTimeout = conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", 120 * 1000)
val checkTimeoutInterval = conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60000)
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
index 9c469370ffe1f..3198d766fca37 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
@@ -160,14 +160,14 @@ private[spark] class DiskBlockObjectWriter(
}
finalPosition = file.length()
// In certain compression codecs, more bytes are written after close() is called
- writeMetrics.shuffleBytesWritten += (finalPosition - reportedPosition)
+ writeMetrics.incShuffleBytesWritten(finalPosition - reportedPosition)
}
// Discard current writes. We do this by flushing the outstanding writes and then
// truncating the file to its initial position.
override def revertPartialWritesAndClose() {
try {
- writeMetrics.shuffleBytesWritten -= (reportedPosition - initialPosition)
+ writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition)
if (initialized) {
objOut.flush()
@@ -212,14 +212,14 @@ private[spark] class DiskBlockObjectWriter(
*/
private def updateBytesWritten() {
val pos = channel.position()
- writeMetrics.shuffleBytesWritten += (pos - reportedPosition)
+ writeMetrics.incShuffleBytesWritten(pos - reportedPosition)
reportedPosition = pos
}
private def callWithTiming(f: => Unit) = {
val start = System.nanoTime()
f
- writeMetrics.shuffleWriteTime += (System.nanoTime() - start)
+ writeMetrics.incShuffleWriteTime(System.nanoTime() - start)
}
// For testing
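The metric fields above now change only through inc/dec helpers, but callWithTiming keeps the usual shape for charging wall-clock time to a metric: take System.nanoTime, evaluate the by-name block, add the difference. The same pattern in isolation, with an AtomicLong standing in for the write-metrics object (Timed is a made-up name):

    import java.util.concurrent.atomic.AtomicLong

    object Timed {
      private val writeTimeNanos = new AtomicLong(0L)

      // Run `f`, charging its wall-clock duration to the counter.
      def callWithTiming(f: => Unit): Unit = {
        val start = System.nanoTime()
        f
        writeTimeNanos.addAndGet(System.nanoTime() - start)
      }

      def totalNanos: Long = writeTimeNanos.get()
    }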
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
index 58fba54710510..af05eb3ca69ce 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
@@ -17,7 +17,7 @@
package org.apache.spark.storage
-import java.io.File
+import java.io.{IOException, File}
import java.text.SimpleDateFormat
import java.util.{Date, Random, UUID}
@@ -71,7 +71,9 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
old
} else {
val newDir = new File(localDirs(dirId), "%02x".format(subDirId))
- newDir.mkdir()
+ if (!newDir.exists() && !newDir.mkdir()) {
+ throw new IOException(s"Failed to create local dir in $newDir.")
+ }
subDirs(dirId)(subDirId) = newDir
newDir
}
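The added guard turns a silently ignored mkdir() failure (disk full, bad permissions, missing parent) into an immediate IOException instead of letting a later write fail obscurely. The same defensive idiom on its own, with a hypothetical helper name:

    import java.io.{File, IOException}

    object DirUtil {
      // Create the directory if needed; fail loudly when the filesystem refuses.
      def ensureDirectory(dir: File): File = {
        if (!dir.exists() && !dir.mkdir()) {
          throw new IOException(s"Failed to create local dir in $dir.")
        }
        dir
      }
    }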
@@ -164,7 +166,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
/** Cleanup local dirs and stop shuffle sender. */
private[spark] def stop() {
// Only perform cleanup if an external service is not serving our shuffle files.
- if (!blockManager.externalShuffleServiceEnabled) {
+ if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) {
localDirs.foreach { localDir =>
if (localDir.isDirectory() && localDir.exists()) {
try {
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
index 8dadf6794039e..61ef5ff168791 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
@@ -31,7 +31,8 @@ import org.apache.spark.util.Utils
private[spark] class DiskStore(blockManager: BlockManager, diskManager: DiskBlockManager)
extends BlockStore(blockManager) with Logging {
- val minMemoryMapBytes = blockManager.conf.getLong("spark.storage.memoryMapThreshold", 2 * 4096L)
+ val minMemoryMapBytes = blockManager.conf.getLong(
+ "spark.storage.memoryMapThreshold", 2 * 1024L * 1024L)
override def getSize(blockId: BlockId): Long = {
diskManager.getFile(blockId.name).length
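The default for spark.storage.memoryMapThreshold moves from 8 KB to 2 MB; the threshold decides whether a block read from disk goes into an ordinary buffer or is memory-mapped, and mapping many tiny blocks costs more in page and address-space overhead than it saves. A sketch of that read-path decision under those assumptions (not DiskStore's code verbatim):

    import java.io.{File, IOException, RandomAccessFile}
    import java.nio.ByteBuffer
    import java.nio.channels.FileChannel.MapMode

    object ReadSketch {
      // Small blocks: plain read into a heap buffer. Large blocks: mmap the region.
      def readBlock(file: File, offset: Long, length: Long, mapThreshold: Long): ByteBuffer = {
        val channel = new RandomAccessFile(file, "r").getChannel
        try {
          if (length < mapThreshold) {
            val buf = ByteBuffer.allocate(length.toInt)
            channel.position(offset)
            while (buf.remaining() > 0) {
              if (channel.read(buf) == -1) {
                throw new IOException("Reached EOF before reading the full block")
              }
            }
            buf.flip()
            buf
          } else {
            channel.map(MapMode.READ_ONLY, offset, length)
          }
        } finally {
          channel.close()
        }
      }
    }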
diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index 2499c11a65b0e..ab9ee4f0096bf 100644
--- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -156,8 +156,8 @@ final class ShuffleBlockFetcherIterator(
// This needs to be released after use.
buf.retain()
results.put(new SuccessFetchResult(BlockId(blockId), sizeMap(blockId), buf))
- shuffleMetrics.remoteBytesRead += buf.size
- shuffleMetrics.remoteBlocksFetched += 1
+ shuffleMetrics.incRemoteBytesRead(buf.size)
+ shuffleMetrics.incRemoteBlocksFetched(1)
}
logTrace("Got remote block " + blockId + " after " + Utils.getUsedTimeMs(startTime))
}
@@ -233,7 +233,7 @@ final class ShuffleBlockFetcherIterator(
val blockId = iter.next()
try {
val buf = blockManager.getBlockData(blockId)
- shuffleMetrics.localBlocksFetched += 1
+ shuffleMetrics.incLocalBlocksFetched(1)
buf.retain()
results.put(new SuccessFetchResult(blockId, 0, buf))
} catch {
@@ -277,7 +277,7 @@ final class ShuffleBlockFetcherIterator(
currentResult = results.take()
val result = currentResult
val stopFetchWait = System.currentTimeMillis()
- shuffleMetrics.fetchWaitTime += (stopFetchWait - startFetchWait)
+ shuffleMetrics.incFetchWaitTime(stopFetchWait - startFetchWait)
result match {
case SuccessFetchResult(_, size, _) => bytesInFlight -= size
diff --git a/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala b/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala
index 56edc4fe2e4ad..e5e1cf5a69a19 100644
--- a/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala
+++ b/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala
@@ -18,6 +18,7 @@
package org.apache.spark.storage
import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
+import java.util.concurrent.ConcurrentHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils
@@ -220,8 +221,7 @@ object StorageLevel {
getCachedStorageLevel(obj)
}
- private[spark] val storageLevelCache =
- new java.util.concurrent.ConcurrentHashMap[StorageLevel, StorageLevel]()
+ private[spark] val storageLevelCache = new ConcurrentHashMap[StorageLevel, StorageLevel]()
private[spark] def getCachedStorageLevel(level: StorageLevel): StorageLevel = {
storageLevelCache.putIfAbsent(level, level)
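getCachedStorageLevel interns storage levels: putIfAbsent keeps the first instance seen for each distinct value, so equal StorageLevels can resolve to one shared object after deserialization. The same idiom as a generic helper (Interner is a made-up name; it assumes keys with sound equals/hashCode):

    import java.util.concurrent.ConcurrentHashMap

    // Interning cache: always return one canonical instance per equal value.
    class Interner[T <: AnyRef] {
      private val cache = new ConcurrentHashMap[T, T]()

      def intern(value: T): T = {
        val previous = cache.putIfAbsent(value, value)
        if (previous == null) value else previous
      }
    }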
diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
index 2a27d49d2de05..88fed833f922d 100644
--- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
@@ -201,7 +201,7 @@ private[spark] object JettyUtils extends Logging {
}
}
- val (server, boundPort) = Utils.startServiceOnPort[Server](port, connect, serverName)
+ val (server, boundPort) = Utils.startServiceOnPort[Server](port, connect, conf, serverName)
ServerInfo(server, boundPort, collection)
}
diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
index 176907dffa46a..0c24ad2760e08 100644
--- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
@@ -57,8 +57,6 @@ private[spark] class SparkUI private (
attachHandler(createRedirectHandler("/", "/jobs", basePath = basePath))
attachHandler(
createRedirectHandler("/stages/stage/kill", "/stages", stagesTab.handleKillRequest))
- // If the UI is live, then serve
- sc.foreach { _.env.metricsSystem.getServletHandlers.foreach(attachHandler) }
}
initialize()
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index 315327c3c6b7c..b5022fe853c49 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -181,7 +181,9 @@ private[spark] object UIUtils extends Logging {
}
val helpButton: Seq[Node] = helpText.map { helpText =>
- (?)
+
+ (?)
+
}.getOrElse(Seq.empty)
@@ -192,9 +194,12 @@ private[spark] object UIUtils extends Logging {
-
-
-
+
{shortAppName} application UI
@@ -212,11 +217,6 @@ private[spark] object UIUtils extends Logging {
{content}
-
}
@@ -234,8 +234,9 @@ private[spark] object UIUtils extends Logging {
@@ -243,11 +244,6 @@ private[spark] object UIUtils extends Logging {
{content}
-
|