From 16b830888734de260f460506b766edab79d30ecd Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Fri, 4 Apr 2014 13:28:42 -0700 Subject: [PATCH 01/21] SPARK-1375. Additional spark-submit cleanup Author: Sandy Ryza Closes #278 from sryza/sandy-spark-1375 and squashes the following commits: 5fbf1e9 [Sandy Ryza] SPARK-1375. Additional spark-submit cleanup --- .../scala/org/apache/spark/deploy/SparkSubmit.scala | 13 ++++++++----- .../apache/spark/deploy/SparkSubmitArguments.scala | 2 +- docs/cluster-overview.md | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 1fa799190409f..e05fbfe321495 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -79,20 +79,23 @@ object SparkSubmit { printErrorAndExit("master must start with yarn, mesos, spark, or local") } - // Because "yarn-standalone" and "yarn-client" encapsulate both the master + // Because "yarn-cluster" and "yarn-client" encapsulate both the master // and deploy mode, we have some logic to infer the master and deploy mode // from each other if only one is specified, or exit early if they are at odds. - if (appArgs.deployMode == null && appArgs.master == "yarn-standalone") { + if (appArgs.deployMode == null && + (appArgs.master == "yarn-standalone" || appArgs.master == "yarn-cluster")) { appArgs.deployMode = "cluster" } if (appArgs.deployMode == "cluster" && appArgs.master == "yarn-client") { printErrorAndExit("Deploy mode \"cluster\" and master \"yarn-client\" are not compatible") } - if (appArgs.deployMode == "client" && appArgs.master == "yarn-standalone") { - printErrorAndExit("Deploy mode \"client\" and master \"yarn-standalone\" are not compatible") + if (appArgs.deployMode == "client" && + (appArgs.master == "yarn-standalone" || appArgs.master == "yarn-cluster")) { + printErrorAndExit("Deploy mode \"client\" and master \"" + appArgs.master + + "\" are not compatible") } if (appArgs.deployMode == "cluster" && appArgs.master.startsWith("yarn")) { - appArgs.master = "yarn-standalone" + appArgs.master = "yarn-cluster" } if (appArgs.deployMode != "cluster" && appArgs.master.startsWith("yarn")) { appArgs.master = "yarn-client" diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 9c8f54ea6f77a..834b3df2f164b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -171,7 +171,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) { outStream.println("Unknown/unsupported param " + unknownParam) } outStream.println( - """Usage: spark-submit [options] + """Usage: spark-submit [options] |Options: | --master MASTER_URL spark://host:port, mesos://host:port, yarn, or local. | --deploy-mode DEPLOY_MODE Mode to deploy the app in, either 'client' or 'cluster'. 
diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index b69e3416fb322..7f75ea44e4cea 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -56,7 +56,7 @@ The recommended way to launch a compiled Spark application is through the spark- bin directory), which takes care of setting up the classpath with Spark and its dependencies, as well as provides a layer over the different cluster managers and deploy modes that Spark supports. It's usage is - spark-submit `` `` + spark-submit `` `` Where options are any of: From a02b535d5e18e987a4b9c4c352838d294f9e853b Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Fri, 4 Apr 2014 14:46:32 -0700 Subject: [PATCH 02/21] Don't create SparkContext in JobProgressListenerSuite. This reduces the time of the test from 11 seconds to 20 milliseconds. Author: Patrick Wendell Closes #324 from pwendell/job-test and squashes the following commits: 868d9eb [Patrick Wendell] Don't create SparkContext in JobProgressListenerSuite. --- .../org/apache/spark/ui/jobs/JobProgressListenerSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala index 67ceee505db3c..beac656f573b4 100644 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala @@ -55,8 +55,8 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Shou } test("test executor id to summary") { - val sc = new SparkContext("local", "test") - val listener = new JobProgressListener(sc.conf) + val conf = new SparkConf() + val listener = new JobProgressListener(conf) val taskMetrics = new TaskMetrics() val shuffleReadMetrics = new ShuffleReadMetrics() From 198892fe8d39a2fad585fa2a7579d8b478456c33 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Fri, 4 Apr 2014 17:16:31 -0700 Subject: [PATCH 03/21] [SPARK-1198] Allow pipes tasks to run in different sub-directories This works as is on Linux/Mac/etc but doesn't cover working on Windows. In here I use ln -sf for symlinks. Putting this up for comments on that. Do we want to create perhaps some classes for doing shell commands - Linux vs Windows. Is there some other way we want to do this? I assume we are still supporting jdk1.6? Also should I update the Java API for pipes to allow this parameter? 
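For reference, usage of the new flag matches the test added further down in this patch; a minimal sketch (assuming an existing SparkContext `sc`):

    // Assumes `sc` is an already-created SparkContext, as in the patch's test suite.
    // Each pipe task runs in its own ./tasks/<uuid> working directory,
    // populated with symlinks to the files the task needs.
    val piped = sc.makeRDD(Seq(1, 2, 3, 4), 2)
      .pipe(Seq("cat"), separateWorkingDir = true)
    piped.collect()   // Array("1", "2", "3", "4")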
Author: Thomas Graves Closes #128 from tgravescs/SPARK1198 and squashes the following commits: abc1289 [Thomas Graves] remove extra tag in pom file ba23fc0 [Thomas Graves] Add support for symlink on windows, remove commons-io usage da4b221 [Thomas Graves] Merge branch 'master' of https://github.com/tgravescs/spark into SPARK1198 61be271 [Thomas Graves] Fix file name filter 6b783bd [Thomas Graves] style fixes 1ab49ca [Thomas Graves] Add support for running pipe tasks is separate directories --- .../scala/org/apache/spark/rdd/PipedRDD.scala | 64 ++++++++++++++++++- .../main/scala/org/apache/spark/rdd/RDD.scala | 7 +- .../scala/org/apache/spark/util/Utils.scala | 45 ++++++++++++- .../org/apache/spark/PipedRDDSuite.scala | 28 +++++++- 4 files changed, 137 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala index 4250a9d02f764..41ae0fec823e7 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala @@ -17,6 +17,9 @@ package org.apache.spark.rdd +import java.io.File +import java.io.FilenameFilter +import java.io.IOException import java.io.PrintWriter import java.util.StringTokenizer @@ -27,6 +30,7 @@ import scala.io.Source import scala.reflect.ClassTag import org.apache.spark.{Partition, SparkEnv, TaskContext} +import org.apache.spark.util.Utils /** @@ -38,7 +42,8 @@ class PipedRDD[T: ClassTag]( command: Seq[String], envVars: Map[String, String], printPipeContext: (String => Unit) => Unit, - printRDDElement: (T, String => Unit) => Unit) + printRDDElement: (T, String => Unit) => Unit, + separateWorkingDir: Boolean) extends RDD[String](prev) { // Similar to Runtime.exec(), if we are given a single string, split it into words @@ -48,12 +53,24 @@ class PipedRDD[T: ClassTag]( command: String, envVars: Map[String, String] = Map(), printPipeContext: (String => Unit) => Unit = null, - printRDDElement: (T, String => Unit) => Unit = null) = - this(prev, PipedRDD.tokenize(command), envVars, printPipeContext, printRDDElement) + printRDDElement: (T, String => Unit) => Unit = null, + separateWorkingDir: Boolean = false) = + this(prev, PipedRDD.tokenize(command), envVars, printPipeContext, printRDDElement, + separateWorkingDir) override def getPartitions: Array[Partition] = firstParent[T].partitions + /** + * A FilenameFilter that accepts anything that isn't equal to the name passed in. + * @param name of file or directory to leave out + */ + class NotEqualsFileNameFilter(filterName: String) extends FilenameFilter { + def accept(dir: File, name: String): Boolean = { + !name.equals(filterName) + } + } + override def compute(split: Partition, context: TaskContext): Iterator[String] = { val pb = new ProcessBuilder(command) // Add the environmental variables to the process. @@ -67,6 +84,38 @@ class PipedRDD[T: ClassTag]( currentEnvVars.putAll(hadoopSplit.getPipeEnvVars()) } + // When spark.worker.separated.working.directory option is turned on, each + // task will be run in separate directory. 
This should be resolve file + // access conflict issue + val taskDirectory = "./tasks/" + java.util.UUID.randomUUID.toString + var workInTaskDirectory = false + logDebug("taskDirectory = " + taskDirectory) + if (separateWorkingDir == true) { + val currentDir = new File(".") + logDebug("currentDir = " + currentDir.getAbsolutePath()) + val taskDirFile = new File(taskDirectory) + taskDirFile.mkdirs() + + try { + val tasksDirFilter = new NotEqualsFileNameFilter("tasks") + + // Need to add symlinks to jars, files, and directories. On Yarn we could have + // directories and other files not known to the SparkContext that were added via the + // Hadoop distributed cache. We also don't want to symlink to the /tasks directories we + // are creating here. + for (file <- currentDir.list(tasksDirFilter)) { + val fileWithDir = new File(currentDir, file) + Utils.symlink(new File(fileWithDir.getAbsolutePath()), + new File(taskDirectory + "/" + fileWithDir.getName())) + } + pb.directory(taskDirFile) + workInTaskDirectory = true + } catch { + case e: Exception => logError("Unable to setup task working directory: " + e.getMessage + + " (" + taskDirectory + ")") + } + } + val proc = pb.start() val env = SparkEnv.get @@ -112,6 +161,15 @@ class PipedRDD[T: ClassTag]( if (exitStatus != 0) { throw new Exception("Subprocess exited with status " + exitStatus) } + + // cleanup task working directory if used + if (workInTaskDirectory == true) { + scala.util.control.Exception.ignoring(classOf[IOException]) { + Utils.deleteRecursively(new File(taskDirectory)) + } + logDebug("Removed task working directory " + taskDirectory) + } + false } } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index ce2b8ac27206b..08c42c5ee87b6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -481,16 +481,19 @@ abstract class RDD[T: ClassTag]( * instead of constructing a huge String to concat all the elements: * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = * for (e <- record._2){f(e)} + * @param separateWorkingDir Use separate working directories for each task. 
* @return the result RDD */ def pipe( command: Seq[String], env: Map[String, String] = Map(), printPipeContext: (String => Unit) => Unit = null, - printRDDElement: (T, String => Unit) => Unit = null): RDD[String] = { + printRDDElement: (T, String => Unit) => Unit = null, + separateWorkingDir: Boolean = false): RDD[String] = { new PipedRDD(this, command, env, if (printPipeContext ne null) sc.clean(printPipeContext) else null, - if (printRDDElement ne null) sc.clean(printRDDElement) else null) + if (printRDDElement ne null) sc.clean(printRDDElement) else null, + separateWorkingDir) } /** diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 62ee704d580c2..737b765e2aed6 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -26,6 +26,7 @@ import java.util.concurrent.{ConcurrentHashMap, Executors, ThreadPoolExecutor} import scala.collection.JavaConversions._ import scala.collection.Map import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.SortedSet import scala.io.Source import scala.reflect.ClassTag @@ -43,6 +44,8 @@ import org.apache.spark.serializer.{DeserializationStream, SerializationStream, */ private[spark] object Utils extends Logging { + val osName = System.getProperty("os.name") + /** Serialize an object using Java serialization */ def serialize[T](o: T): Array[Byte] = { val bos = new ByteArrayOutputStream() @@ -521,9 +524,10 @@ private[spark] object Utils extends Logging { /** * Delete a file or directory and its contents recursively. + * Don't follow directories if they are symlinks. */ def deleteRecursively(file: File) { - if (file.isDirectory) { + if ((file.isDirectory) && !isSymlink(file)) { for (child <- listFilesSafely(file)) { deleteRecursively(child) } @@ -536,6 +540,25 @@ private[spark] object Utils extends Logging { } } + /** + * Check to see if file is a symbolic link. + */ + def isSymlink(file: File): Boolean = { + if (file == null) throw new NullPointerException("File must not be null") + if (osName.startsWith("Windows")) return false + val fileInCanonicalDir = if (file.getParent() == null) { + file + } else { + new File(file.getParentFile().getCanonicalFile(), file.getName()) + } + + if (fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile())) { + return false; + } else { + return true; + } + } + /** * Convert a Java memory parameter passed to -Xmx (such as 300m or 1g) to a number of megabytes. */ @@ -898,6 +921,26 @@ private[spark] object Utils extends Logging { count } + /** + * Creates a symlink. Note jdk1.7 has Files.createSymbolicLink but not used here + * for jdk1.6 support. Supports windows by doing copy, everything else uses "ln -sf". + * @param src absolute path to the source + * @param dst relative path for the destination + */ + def symlink(src: File, dst: File) { + if (!src.isAbsolute()) { + throw new IOException("Source must be absolute") + } + if (dst.isAbsolute()) { + throw new IOException("Destination must be relative") + } + val linkCmd = if (osName.startsWith("Windows")) "copy" else "ln -sf" + import scala.sys.process._ + (linkCmd + " " + src.getAbsolutePath() + " " + dst.getPath()) lines_! 
ProcessLogger(line => + (logInfo(line))) + } + + /** Return the class name of the given object, removing all dollar signs */ def getFormattedClassName(obj: AnyRef) = { obj.getClass.getSimpleName.replace("$", "") diff --git a/core/src/test/scala/org/apache/spark/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/PipedRDDSuite.scala index 6e7fd55fa4bb1..627e9b5cd9060 100644 --- a/core/src/test/scala/org/apache/spark/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/PipedRDDSuite.scala @@ -17,8 +17,11 @@ package org.apache.spark -import org.scalatest.FunSuite +import java.io.File + +import com.google.common.io.Files +import org.scalatest.FunSuite import org.apache.spark.rdd.{HadoopRDD, PipedRDD, HadoopPartition} import org.apache.hadoop.mapred.{JobConf, TextInputFormat, FileSplit} @@ -126,6 +129,29 @@ class PipedRDDSuite extends FunSuite with SharedSparkContext { } } + test("basic pipe with separate working directory") { + if (testCommandAvailable("cat")) { + val nums = sc.makeRDD(Array(1, 2, 3, 4), 2) + val piped = nums.pipe(Seq("cat"), separateWorkingDir = true) + val c = piped.collect() + assert(c.size === 4) + assert(c(0) === "1") + assert(c(1) === "2") + assert(c(2) === "3") + assert(c(3) === "4") + val pipedPwd = nums.pipe(Seq("pwd"), separateWorkingDir = true) + val collectPwd = pipedPwd.collect() + assert(collectPwd(0).contains("tasks/")) + val pipedLs = nums.pipe(Seq("ls"), separateWorkingDir = true).collect() + // make sure symlinks were created + assert(pipedLs.length > 0) + // clean up top level tasks directory + new File("tasks").delete() + } else { + assert(true) + } + } + test("test pipe exports map_input_file") { testExportInputFile("map_input_file") } From d956cc251676d67d87bd6dbfa82be864933d8136 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 4 Apr 2014 17:23:17 -0700 Subject: [PATCH 04/21] [SQL] Minor fixes. Author: Michael Armbrust Closes #315 from marmbrus/minorFixes and squashes the following commits: b23a15d [Michael Armbrust] fix scaladoc 11062ac [Michael Armbrust] Fix registering "SELECT *" queries as tables and caching them. As some tests for this and self-joins. 3997dc9 [Michael Armbrust] Move Row extractor to catalyst. 208bf5e [Michael Armbrust] More idiomatic naming of DSL functions. * subquery => as * for join condition => on, i.e., `r.join(s, condition = 'a == 'b)` =>`r.join(s, on = 'a == 'b)` 87211ce [Michael Armbrust] Correctly handle self joins of in-memory cached tables. 69e195e [Michael Armbrust] Change != to !== in the DSL since != will always translate to != on Any. 01f2dd5 [Michael Armbrust] Correctly assign aliases to tables in SqlParser. 
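To illustrate the DSL renames listed above, a rough before/after sketch (assuming the `testData2` table used by the DSL tests in this patch):

    // `testData2` is the SchemaRDD from the test data; not defined here.
    // subquery => as, and the join condition parameter is now named `on`.
    val x = testData2.where('a === 1).as('x)     // was .subquery('x)
    val y = testData2.where('a === 2).as('y)
    x.join(y, on = Some("x.a".attr === "y.a".attr))
    // != is replaced by !== so the comparison is not resolved as Any.!=
    testData2.where('a !== 2)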
--- .../apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../apache/spark/sql/catalyst/dsl/package.scala | 2 +- .../spark/sql/catalyst/expressions/Row.scala | 15 +++++++++++++++ .../catalyst/plans/logical/basicOperators.scala | 1 + .../scala/org/apache/spark/sql/package.scala | 15 +-------------- .../scala/org/apache/spark/sql/SchemaRDD.scala | 16 ++++++++-------- .../apache/spark/sql/execution/SparkPlan.scala | 3 +++ .../org/apache/spark/sql/CachedTableSuite.scala | 13 +++++++++++++ .../org/apache/spark/sql/DslQuerySuite.scala | 16 ++++++++-------- .../spark/sql/parquet/ParquetQuerySuite.scala | 4 ++-- 10 files changed, 53 insertions(+), 34 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 4ea80fee23e1e..5b6aea81cb7d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -219,7 +219,7 @@ class SqlParser extends StandardTokenParsers { protected lazy val relationFactor: Parser[LogicalPlan] = ident ~ (opt(AS) ~> opt(ident)) ^^ { - case ident ~ alias => UnresolvedRelation(alias, ident) + case tableName ~ alias => UnresolvedRelation(None, tableName, alias) } | "(" ~> query ~ ")" ~ opt(AS) ~ ident ^^ { case s ~ _ ~ _ ~ a => Subquery(a, s) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 2c4bf1715b646..2d62e4cbbce01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -70,7 +70,7 @@ package object dsl { def > (other: Expression) = GreaterThan(expr, other) def >= (other: Expression) = GreaterThanOrEqual(expr, other) def === (other: Expression) = Equals(expr, other) - def != (other: Expression) = Not(Equals(expr, other)) + def !== (other: Expression) = Not(Equals(expr, other)) def like(other: Expression) = Like(expr, other) def rlike(other: Expression) = RLike(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala index 6f939e6c41f6b..9f4d84466e6ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala @@ -19,6 +19,21 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.types.NativeType +object Row { + /** + * This method can be used to extract fields from a [[Row]] object in a pattern match. Example: + * {{{ + * import org.apache.spark.sql._ + * + * val pairs = sql("SELECT key, value FROM src").rdd.map { + * case Row(key: Int, value: String) => + * key -> value + * } + * }}} + */ + def unapplySeq(row: Row): Some[Seq[Any]] = Some(row) +} + /** * Represents one row of output from a relational operator. Allows both generic access by ordinal, * which will incur boxing overhead for primitives, as well as native primitive access. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index b39c2b32cc42c..cfc0b0c3a8d98 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -162,6 +162,7 @@ case class LowerCaseSchema(child: LogicalPlan) extends UnaryNode { a.nullable)( a.exprId, a.qualifiers) + case other => other } def references = Set.empty diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/package.scala index 9ec31689b5098..4589129cd1c90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/package.scala @@ -32,18 +32,5 @@ package object sql { type Row = catalyst.expressions.Row - object Row { - /** - * This method can be used to extract fields from a [[Row]] object in a pattern match. Example: - * {{{ - * import org.apache.spark.sql._ - * - * val pairs = sql("SELECT key, value FROM src").rdd.map { - * case Row(key: Int, value: String) => - * key -> value - * } - * }}} - */ - def unapplySeq(row: Row): Some[Seq[Any]] = Some(row) - } + val Row = catalyst.expressions.Row } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index a62cb8aa1321f..fc95781448569 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -148,17 +148,17 @@ class SchemaRDD( * * @param otherPlan the [[SchemaRDD]] that should be joined with this one. * @param joinType One of `Inner`, `LeftOuter`, `RightOuter`, or `FullOuter`. Defaults to `Inner.` - * @param condition An optional condition for the join operation. This is equivilent to the `ON` - * clause in standard SQL. In the case of `Inner` joins, specifying a - * `condition` is equivilent to adding `where` clauses after the `join`. + * @param on An optional condition for the join operation. This is equivilent to the `ON` + * clause in standard SQL. In the case of `Inner` joins, specifying a + * `condition` is equivilent to adding `where` clauses after the `join`. * * @group Query */ def join( otherPlan: SchemaRDD, joinType: JoinType = Inner, - condition: Option[Expression] = None): SchemaRDD = - new SchemaRDD(sqlContext, Join(logicalPlan, otherPlan.logicalPlan, joinType, condition)) + on: Option[Expression] = None): SchemaRDD = + new SchemaRDD(sqlContext, Join(logicalPlan, otherPlan.logicalPlan, joinType, on)) /** * Sorts the results by the given expressions. @@ -195,14 +195,14 @@ class SchemaRDD( * with the same name, for example, when peforming self-joins. 
* * {{{ - * val x = schemaRDD.where('a === 1).subquery('x) - * val y = schemaRDD.where('a === 2).subquery('y) + * val x = schemaRDD.where('a === 1).as('x) + * val y = schemaRDD.where('a === 2).as('y) * x.join(y).where("x.a".attr === "y.a".attr), * }}} * * @group Query */ - def subquery(alias: Symbol) = + def as(alias: Symbol) = new SchemaRDD(sqlContext, Subquery(alias.name, logicalPlan)) /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index acb1ee83a72f6..daa423cb8ea1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.catalyst.plans.{QueryPlan, logical} import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.columnar.InMemoryColumnarTableScan abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging { self: Product => @@ -69,6 +70,8 @@ case class SparkLogicalPlan(alreadyPlanned: SparkPlan) SparkLogicalPlan( alreadyPlanned match { case ExistingRdd(output, rdd) => ExistingRdd(output.map(_.newInstance), rdd) + case InMemoryColumnarTableScan(output, child) => + InMemoryColumnarTableScan(output.map(_.newInstance), child) case _ => sys.error("Multiple instance of the same relation detected.") }).asInstanceOf[this.type] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index e5902c3cae381..7c6a642278226 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -58,4 +58,17 @@ class CachedTableSuite extends QueryTest { TestSQLContext.uncacheTable("testData") } } + + test("SELECT Star Cached Table") { + TestSQLContext.sql("SELECT * FROM testData").registerAsTable("selectStar") + TestSQLContext.cacheTable("selectStar") + TestSQLContext.sql("SELECT * FROM selectStar") + TestSQLContext.uncacheTable("selectStar") + } + + test("Self-join cached") { + TestSQLContext.cacheTable("testData") + TestSQLContext.sql("SELECT * FROM testData a JOIN testData b ON a.key = b.key") + TestSQLContext.uncacheTable("testData") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala index 2524a37cbac13..be0f4a4c73b36 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala @@ -119,8 +119,8 @@ class DslQuerySuite extends QueryTest { } test("inner join, where, multiple matches") { - val x = testData2.where('a === 1).subquery('x) - val y = testData2.where('a === 1).subquery('y) + val x = testData2.where('a === 1).as('x) + val y = testData2.where('a === 1).as('y) checkAnswer( x.join(y).where("x.a".attr === "y.a".attr), (1,1,1,1) :: @@ -131,8 +131,8 @@ class DslQuerySuite extends QueryTest { } test("inner join, no matches") { - val x = testData2.where('a === 1).subquery('x) - val y = testData2.where('a === 2).subquery('y) + val x = testData2.where('a === 1).as('x) + val y = testData2.where('a === 2).as('y) checkAnswer( x.join(y).where("x.a".attr === "y.a".attr), Nil) @@ -140,8 +140,8 @@ class DslQuerySuite extends 
QueryTest { test("big inner join, 4 matches per row") { val bigData = testData.unionAll(testData).unionAll(testData).unionAll(testData) - val bigDataX = bigData.subquery('x) - val bigDataY = bigData.subquery('y) + val bigDataX = bigData.as('x) + val bigDataY = bigData.as('y) checkAnswer( bigDataX.join(bigDataY).where("x.key".attr === "y.key".attr), @@ -181,8 +181,8 @@ class DslQuerySuite extends QueryTest { } test("full outer join") { - val left = upperCaseData.where('N <= 4).subquery('left) - val right = upperCaseData.where('N >= 3).subquery('right) + val left = upperCaseData.where('N <= 4).as('left) + val right = upperCaseData.where('N >= 3).as('right) checkAnswer( left.join(right, FullOuter, Some("left.N".attr === "right.N".attr)), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index a62a3c4d02354..fc68d6c5620d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -56,8 +56,8 @@ class ParquetQuerySuite extends FunSuite with BeforeAndAfterAll { } test("self-join parquet files") { - val x = ParquetTestData.testData.subquery('x) - val y = ParquetTestData.testData.subquery('y) + val x = ParquetTestData.testData.as('x) + val y = ParquetTestData.testData.as('y) val query = x.join(y).where("x.myint".attr === "y.myint".attr) // Check to make sure that the attributes from either side of the join have unique expression From 60e18ce7dd1016647b63586520b713efc45494a8 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Fri, 4 Apr 2014 17:29:29 -0700 Subject: [PATCH 05/21] SPARK-1414. Python API for SparkContext.wholeTextFiles Also clarified comment on each file having to fit in memory Author: Matei Zaharia Closes #327 from mateiz/py-whole-files and squashes the following commits: 9ad64a5 [Matei Zaharia] SPARK-1414. Python API for SparkContext.wholeTextFiles --- .../scala/org/apache/spark/SparkContext.scala | 2 +- .../spark/api/java/JavaSparkContext.scala | 2 +- .../apache/spark/api/python/PythonRDD.scala | 6 ++- python/pyspark/context.py | 44 ++++++++++++++++++- python/pyspark/serializers.py | 2 +- 5 files changed, 49 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 28a865c0ad3b5..835cffe37a938 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -395,7 +395,7 @@ class SparkContext( * (a-hdfs-path/part-nnnnn, its content) * }}} * - * @note Small files are perferred, large file is also allowable, but may cause bad performance. + * @note Small files are preferred, as each file will be loaded fully in memory. */ def wholeTextFiles(path: String): RDD[(String, String)] = { newAPIHadoopFile( diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 6cbdeac58d5e2..a2855d4db1d2e 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -177,7 +177,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork * (a-hdfs-path/part-nnnnn, its content) * }}} * - * @note Small files are perferred, large file is also allowable, but may cause bad performance. 
+ * @note Small files are preferred, as each file will be loaded fully in memory. */ def wholeTextFiles(path: String): JavaPairRDD[String, String] = new JavaPairRDD(sc.wholeTextFiles(path)) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index b67286a4e3b75..32f1100406d74 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,6 +19,7 @@ package org.apache.spark.api.python import java.io._ import java.net._ +import java.nio.charset.Charset import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections} import scala.collection.JavaConversions._ @@ -206,6 +207,7 @@ private object SpecialLengths { } private[spark] object PythonRDD { + val UTF8 = Charset.forName("UTF-8") def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = { @@ -266,7 +268,7 @@ private[spark] object PythonRDD { } def writeUTF(str: String, dataOut: DataOutputStream) { - val bytes = str.getBytes("UTF-8") + val bytes = str.getBytes(UTF8) dataOut.writeInt(bytes.length) dataOut.write(bytes) } @@ -286,7 +288,7 @@ private[spark] object PythonRDD { private class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] { - override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8") + override def call(arr: Array[Byte]) : String = new String(arr, PythonRDD.UTF8) } /** diff --git a/python/pyspark/context.py b/python/pyspark/context.py index bf2454fd7e38e..ff1023bbfa539 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -28,7 +28,8 @@ from pyspark.conf import SparkConf from pyspark.files import SparkFiles from pyspark.java_gateway import launch_gateway -from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer +from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \ + PairDeserializer from pyspark.storagelevel import StorageLevel from pyspark import rdd from pyspark.rdd import RDD @@ -257,6 +258,45 @@ def textFile(self, name, minSplits=None): return RDD(self._jsc.textFile(name, minSplits), self, UTF8Deserializer()) + def wholeTextFiles(self, path): + """ + Read a directory of text files from HDFS, a local file system + (available on all nodes), or any Hadoop-supported file system + URI. Each file is read as a single record and returned in a + key-value pair, where the key is the path of each file, the + value is the content of each file. + + For example, if you have the following files:: + + hdfs://a-hdfs-path/part-00000 + hdfs://a-hdfs-path/part-00001 + ... + hdfs://a-hdfs-path/part-nnnnn + + Do C{rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")}, + then C{rdd} contains:: + + (a-hdfs-path/part-00000, its content) + (a-hdfs-path/part-00001, its content) + ... + (a-hdfs-path/part-nnnnn, its content) + + NOTE: Small files are preferred, as each file will be loaded + fully in memory. + + >>> dirPath = os.path.join(tempdir, "files") + >>> os.mkdir(dirPath) + >>> with open(os.path.join(dirPath, "1.txt"), "w") as file1: + ... file1.write("1") + >>> with open(os.path.join(dirPath, "2.txt"), "w") as file2: + ... 
file2.write("2") + >>> textFiles = sc.wholeTextFiles(dirPath) + >>> sorted(textFiles.collect()) + [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] + """ + return RDD(self._jsc.wholeTextFiles(path), self, + PairDeserializer(UTF8Deserializer(), UTF8Deserializer())) + def _checkpointFile(self, name, input_deserializer): jrdd = self._jsc.checkpointFile(name) return RDD(jrdd, self, input_deserializer) @@ -425,7 +465,7 @@ def _test(): globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['tempdir'] = tempfile.mkdtemp() atexit.register(lambda: shutil.rmtree(globs['tempdir'])) - (failure_count, test_count) = doctest.testmod(globs=globs) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 12c63f186a2b7..4d802924df4a1 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -290,7 +290,7 @@ class MarshalSerializer(FramedSerializer): class UTF8Deserializer(Serializer): """ - Deserializes streams written by getBytes. + Deserializes streams written by String.getBytes. """ def loads(self, stream): From 5f3c1bb5136b3389bea3af4fb39a083d979efa4c Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Fri, 4 Apr 2014 19:15:15 -0700 Subject: [PATCH 06/21] Add test utility for generating Jar files with compiled classes. This was requested by a few different people and may be generally useful, so I'd like to contribute this and not block on a different PR for it to get in. Author: Patrick Wendell Closes #326 from pwendell/class-loader-test-utils and squashes the following commits: ff3e88e [Patrick Wendell] Add test utility for generating Jar files with compiled classes. --- .../scala/org/apache/spark/TestUtils.scala | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/TestUtils.scala diff --git a/core/src/test/scala/org/apache/spark/TestUtils.scala b/core/src/test/scala/org/apache/spark/TestUtils.scala new file mode 100644 index 0000000000000..1611d09652d40 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/TestUtils.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import java.io.{File, FileInputStream, FileOutputStream} +import java.net.{URI, URL} +import java.util.jar.{JarEntry, JarOutputStream} + +import scala.collection.JavaConversions._ + +import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} +import com.google.common.io.Files + +object TestUtils { + + /** + * Create a jar that defines classes with the given names. 
+ * + * Note: if this is used during class loader tests, class names should be unique + * in order to avoid interference between tests. + */ + def createJarWithClasses(classNames: Seq[String]): URL = { + val tempDir = Files.createTempDir() + val files = for (name <- classNames) yield createCompiledClass(name, tempDir) + val jarFile = new File(tempDir, "testJar-%s.jar".format(System.currentTimeMillis())) + createJar(files, jarFile) + } + + /** + * Create a jar file that contains this set of files. All files will be located at the root + * of the jar. + */ + def createJar(files: Seq[File], jarFile: File): URL = { + val jarFileStream = new FileOutputStream(jarFile) + val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest()) + + for (file <- files) { + val jarEntry = new JarEntry(file.getName) + jarStream.putNextEntry(jarEntry) + + val in = new FileInputStream(file) + val buffer = new Array[Byte](10240) + var nRead = 0 + while (nRead <= 0) { + nRead = in.read(buffer, 0, buffer.length) + jarStream.write(buffer, 0, nRead) + } + in.close() + } + jarStream.close() + jarFileStream.close() + + jarFile.toURI.toURL + } + + // Adapted from the JavaCompiler.java doc examples + private val SOURCE = JavaFileObject.Kind.SOURCE + private def createURI(name: String) = { + URI.create(s"string:///${name.replace(".", "/")}${SOURCE.extension}") + } + + private class JavaSourceFromString(val name: String, val code: String) + extends SimpleJavaFileObject(createURI(name), SOURCE) { + override def getCharContent(ignoreEncodingErrors: Boolean) = code + } + + /** Creates a compiled class with the given name. Class file will be placed in destDir. */ + def createCompiledClass(className: String, destDir: File): File = { + val compiler = ToolProvider.getSystemJavaCompiler + val sourceFile = new JavaSourceFromString(className, s"public class $className {}") + + // Calling this outputs a class file in pwd. It's easier to just rename the file than + // build a custom FileManager that controls the output location. + compiler.getTask(null, null, null, null, null, Seq(sourceFile)).call() + + val fileName = className + ".class" + val result = new File(fileName) + if (!result.exists()) throw new Exception("Compiled file not found: " + fileName) + val out = new File(destDir, fileName) + result.renameTo(out) + out + } +} From 1347ebd4b52ffb9197fc4137a55dff6badb149ba Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Fri, 4 Apr 2014 19:19:48 -0700 Subject: [PATCH 07/21] [SPARK-1419] Bumped parent POM to apache 14 Keeping up-to-date with the parent, which includes some bugfixes. Author: Mark Hamstra Closes #328 from markhamstra/Apache14 and squashes the following commits: 3f19975 [Mark Hamstra] Bumped parent POM to apache 14 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7d58060cba606..01341d21b7f23 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ org.apache apache - 13 + 14 org.apache.spark spark-parent From b50ddfde0342990979979e58348f54c10b500c90 Mon Sep 17 00:00:00 2001 From: Haoyuan Li Date: Fri, 4 Apr 2014 20:36:24 -0700 Subject: [PATCH 08/21] SPARK-1305: Support persisting RDD's directly to Tachyon Move the PR#468 of apache-incubator-spark to the apache-spark "Adding an option to persist Spark RDD blocks into Tachyon." 
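A minimal sketch of how the new storage level is meant to be used (assuming a Tachyon master reachable at the URL configured via `spark.tachyonStore.url`, which defaults to tachyon://localhost:19998 in this patch):

    import org.apache.spark.storage.StorageLevel

    // Assumes `sc` is an existing SparkContext and a Tachyon master is running.
    // Blocks are written to Tachyon (off the JVM heap) instead of the
    // executor's memory or local disk.
    val rdd = sc.parallelize(1 to 100000)
    rdd.persist(StorageLevel.OFF_HEAP)
    rdd.count()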
Author: Haoyuan Li Author: RongGu Closes #158 from RongGu/master and squashes the following commits: 72b7768 [Haoyuan Li] merge master 9f7fa1b [Haoyuan Li] fix code style ae7834b [Haoyuan Li] minor cleanup a8b3ec6 [Haoyuan Li] merge master branch e0f4891 [Haoyuan Li] better check offheap. 55b5918 [RongGu] address matei's comment on the replication of offHeap storagelevel 7cd4600 [RongGu] remove some logic code for tachyonstore's replication 51149e7 [RongGu] address aaron's comment on returning value of the remove() function in tachyonstore 8adfcfa [RongGu] address arron's comment on inTachyonSize 120e48a [RongGu] changed the root-level dir name in Tachyon 5cc041c [Haoyuan Li] address aaron's comments 9b97935 [Haoyuan Li] address aaron's comments d9a6438 [Haoyuan Li] fix for pspark 77d2703 [Haoyuan Li] change python api.git status 3dcace4 [Haoyuan Li] address matei's comments 91fa09d [Haoyuan Li] address patrick's comments 589eafe [Haoyuan Li] use TRY_CACHE instead of MUST_CACHE 64348b2 [Haoyuan Li] update conf docs. ed73e19 [Haoyuan Li] Merge branch 'master' of github.com:RongGu/spark-1 619a9a8 [RongGu] set number of directories in TachyonStore back to 64; added a TODO tag for duplicated code from the DiskStore be79d77 [RongGu] find a way to clean up some unnecessay metods and classed to make the code simpler 49cc724 [Haoyuan Li] update docs with off_headp option 4572f9f [RongGu] reserving the old apply function API of StorageLevel 04301d3 [RongGu] rename StorageLevel.TACHYON to Storage.OFF_HEAP c9aeabf [RongGu] rename the StorgeLevel.TACHYON as StorageLevel.OFF_HEAP 76805aa [RongGu] unifies the config properties name prefix; add the configs into docs/configuration.md e700d9c [RongGu] add the SparkTachyonHdfsLR example and some comments fd84156 [RongGu] use randomUUID to generate sparkapp directory name on tachyon;minor code style fix 939e467 [Haoyuan Li] 0.4.1-thrift from maven central 86a2eab [Haoyuan Li] tachyon 0.4.1-thrift is in the staging repo. but jenkins failed to download it. temporarily revert it back to 0.4.1 16c5798 [RongGu] make the dependency on tachyon as tachyon-0.4.1-thrift eacb2e8 [RongGu] Merge branch 'master' of https://github.com/RongGu/spark-1 bbeb4de [RongGu] fix the JsonProtocolSuite test failure problem 6adb58f [RongGu] Merge branch 'master' of https://github.com/RongGu/spark-1 d827250 [RongGu] fix JsonProtocolSuie test failure 716e93b [Haoyuan Li] revert the version ca14469 [Haoyuan Li] bump tachyon version to 0.4.1-thrift 2825a13 [RongGu] up-merging to the current master branch of the apache spark 6a22c1a [Haoyuan Li] fix scalastyle 8968b67 [Haoyuan Li] exclude more libraries from tachyon dependency to be the same as referencing tachyon-client. 77be7e8 [RongGu] address mateiz's comment about the temp folder name problem. The implementation followed mateiz's advice. 1dcadf9 [Haoyuan Li] typo bf278fa [Haoyuan Li] fix python tests e82909c [Haoyuan Li] minor cleanup 776a56c [Haoyuan Li] address patrick's and ali's comments from the previous PR 8859371 [Haoyuan Li] various minor fixes and clean up e3ddbba [Haoyuan Li] add doc to use Tachyon cache mode. fcaeab2 [Haoyuan Li] address Aaron's comment e554b1e [Haoyuan Li] add python code 47304b3 [Haoyuan Li] make tachyonStore in BlockMananger lazy val; add more comments StorageLevels. 
dc8ef24 [Haoyuan Li] add old storelevel constructor e01a271 [Haoyuan Li] update tachyon 0.4.1 8011a96 [RongGu] fix a brought-in mistake in StorageLevel 70ca182 [RongGu] a bit change in comment 556978b [RongGu] fix the scalastyle errors 791189b [RongGu] "Adding an option to persist Spark RDD blocks into Tachyon." move the PR#468 of apache-incubator-spark to the apache-spark --- core/pom.xml | 47 ++++++ .../apache/spark/api/java/StorageLevels.java | 46 ++++-- .../scala/org/apache/spark/SparkContext.scala | 10 +- .../CoarseGrainedExecutorBackend.scala | 6 +- .../spark/executor/ExecutorExitCode.scala | 9 + .../apache/spark/storage/BlockManager.scala | 86 ++++++++-- .../spark/storage/BlockManagerMaster.scala | 5 +- .../storage/BlockManagerMasterActor.scala | 37 +++-- .../spark/storage/BlockManagerMessages.scala | 17 +- .../apache/spark/storage/StorageLevel.scala | 72 +++++--- .../spark/storage/StorageStatusListener.scala | 2 +- .../apache/spark/storage/StorageUtils.scala | 23 ++- .../spark/storage/TachyonBlockManager.scala | 155 ++++++++++++++++++ .../spark/storage/TachyonFileSegment.scala | 28 ++++ .../apache/spark/storage/TachyonStore.scala | 142 ++++++++++++++++ .../apache/spark/ui/storage/IndexPage.scala | 3 + .../org/apache/spark/util/JsonProtocol.scala | 11 +- .../scala/org/apache/spark/util/Utils.scala | 46 +++++- .../spark/storage/BlockManagerSuite.scala | 25 ++- .../apache/spark/util/JsonProtocolSuite.scala | 20 +-- docs/configuration.md | 39 +++-- docs/scala-programming-guide.md | 127 ++++++++++---- .../org/apache/spark/examples/SparkPi.scala | 2 +- .../spark/examples/SparkTachyonHdfsLR.scala | 80 +++++++++ .../spark/examples/SparkTachyonPi.scala | 52 ++++++ project/SparkBuild.scala | 17 +- python/pyspark/context.py | 7 +- python/pyspark/rdd.py | 3 +- python/pyspark/storagelevel.py | 28 ++-- 29 files changed, 976 insertions(+), 169 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala create mode 100644 core/src/main/scala/org/apache/spark/storage/TachyonFileSegment.scala create mode 100644 core/src/main/scala/org/apache/spark/storage/TachyonStore.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala diff --git a/core/pom.xml b/core/pom.xml index e4c32eff0cd77..66f9fc4961b03 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -200,6 +200,53 @@ derby test + + org.tachyonproject + tachyon + 0.4.1-thrift + + + org.apache.hadoop + hadoop-client + + + org.apache.curator + curator-recipes + + + org.eclipse.jetty + jetty-jsp + + + org.eclipse.jetty + jetty-webapp + + + org.eclipse.jetty + jetty-server + + + org.eclipse.jetty + jetty-servlet + + + junit + junit + + + org.powermock + powermock-module-junit4 + + + org.powermock + powermock-api-mockito + + + org.apache.curator + curator-test + + + org.scalatest scalatest_${scala.binary.version} diff --git a/core/src/main/java/org/apache/spark/api/java/StorageLevels.java b/core/src/main/java/org/apache/spark/api/java/StorageLevels.java index 9f13b39909481..840a1bd93bfbb 100644 --- a/core/src/main/java/org/apache/spark/api/java/StorageLevels.java +++ b/core/src/main/java/org/apache/spark/api/java/StorageLevels.java @@ -23,17 +23,18 @@ * Expose some commonly useful storage level constants. 
*/ public class StorageLevels { - public static final StorageLevel NONE = create(false, false, false, 1); - public static final StorageLevel DISK_ONLY = create(true, false, false, 1); - public static final StorageLevel DISK_ONLY_2 = create(true, false, false, 2); - public static final StorageLevel MEMORY_ONLY = create(false, true, true, 1); - public static final StorageLevel MEMORY_ONLY_2 = create(false, true, true, 2); - public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, 1); - public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, 2); - public static final StorageLevel MEMORY_AND_DISK = create(true, true, true, 1); - public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, true, 2); - public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, 1); - public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, 2); + public static final StorageLevel NONE = create(false, false, false, false, 1); + public static final StorageLevel DISK_ONLY = create(true, false, false, false, 1); + public static final StorageLevel DISK_ONLY_2 = create(true, false, false, false, 2); + public static final StorageLevel MEMORY_ONLY = create(false, true, false, true, 1); + public static final StorageLevel MEMORY_ONLY_2 = create(false, true, false, true, 2); + public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, false, 1); + public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, false, 2); + public static final StorageLevel MEMORY_AND_DISK = create(true, true, false, true, 1); + public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, false, true, 2); + public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, false, 1); + public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, false, 2); + public static final StorageLevel OFF_HEAP = create(false, false, true, false, 1); /** * Create a new StorageLevel object. @@ -42,7 +43,26 @@ public class StorageLevels { * @param deserialized saved as deserialized objects, if true * @param replication replication factor */ - public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized, int replication) { - return StorageLevel.apply(useDisk, useMemory, deserialized, replication); + @Deprecated + public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized, + int replication) { + return StorageLevel.apply(useDisk, useMemory, false, deserialized, replication); + } + + /** + * Create a new StorageLevel object. 
+ * @param useDisk saved to disk, if true + * @param useMemory saved to memory, if true + * @param useOffHeap saved to Tachyon, if true + * @param deserialized saved as deserialized objects, if true + * @param replication replication factor + */ + public static StorageLevel create( + boolean useDisk, + boolean useMemory, + boolean useOffHeap, + boolean deserialized, + int replication) { + return StorageLevel.apply(useDisk, useMemory, useOffHeap, deserialized, replication); } } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 835cffe37a938..fcf16ce1b278e 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -19,14 +19,13 @@ package org.apache.spark import java.io._ import java.net.URI -import java.util.{Properties, UUID} import java.util.concurrent.atomic.AtomicInteger - +import java.util.{Properties, UUID} +import java.util.UUID.randomUUID import scala.collection.{Map, Set} import scala.collection.generic.Growable import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.reflect.{ClassTag, classTag} - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} @@ -130,6 +129,11 @@ class SparkContext( val master = conf.get("spark.master") val appName = conf.get("spark.app.name") + // Generate the random name for a temp folder in Tachyon + // Add a timestamp as the suffix here to make it more safe + val tachyonFolderName = "spark-" + randomUUID.toString() + conf.set("spark.tachyonStore.folderName", tachyonFolderName) + val isLocal = (master == "local" || master.startsWith("local[")) if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true") diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 3486092a140fb..16887d8892b31 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -53,7 +53,8 @@ private[spark] class CoarseGrainedExecutorBackend( case RegisteredExecutor(sparkProperties) => logInfo("Successfully registered with driver") // Make this host instead of hostPort ? 
- executor = new Executor(executorId, Utils.parseHostPort(hostPort)._1, sparkProperties) + executor = new Executor(executorId, Utils.parseHostPort(hostPort)._1, sparkProperties, + false) case RegisterExecutorFailed(message) => logError("Slave registration failed: " + message) @@ -105,7 +106,8 @@ private[spark] object CoarseGrainedExecutorBackend { // set it val sparkHostPort = hostname + ":" + boundPort actorSystem.actorOf( - Props(classOf[CoarseGrainedExecutorBackend], driverUrl, executorId, sparkHostPort, cores), + Props(classOf[CoarseGrainedExecutorBackend], driverUrl, executorId, + sparkHostPort, cores), name = "Executor") workerUrl.foreach{ url => actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher") diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala index 210f3dbeebaca..ceff3a067d72a 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala @@ -41,6 +41,12 @@ object ExecutorExitCode { /** DiskStore failed to create a local temporary directory after many attempts. */ val DISK_STORE_FAILED_TO_CREATE_DIR = 53 + /** TachyonStore failed to initialize after many attempts. */ + val TACHYON_STORE_FAILED_TO_INITIALIZE = 54 + + /** TachyonStore failed to create a local temporary directory after many attempts. */ + val TACHYON_STORE_FAILED_TO_CREATE_DIR = 55 + def explainExitCode(exitCode: Int): String = { exitCode match { case UNCAUGHT_EXCEPTION => "Uncaught exception" @@ -48,6 +54,9 @@ object ExecutorExitCode { case OOM => "OutOfMemoryError" case DISK_STORE_FAILED_TO_CREATE_DIR => "Failed to create local directory (bad spark.local.dir?)" + case TACHYON_STORE_FAILED_TO_INITIALIZE => "TachyonStore failed to initialize." + case TACHYON_STORE_FAILED_TO_CREATE_DIR => + "TachyonStore failed to create a local temporary directory." 
case _ => "Unknown executor exit code (" + exitCode + ")" + ( if (exitCode > 128) { diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 71584b6eb102a..19138d9dde697 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -19,22 +19,20 @@ package org.apache.spark.storage import java.io.{File, InputStream, OutputStream} import java.nio.{ByteBuffer, MappedByteBuffer} - import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import scala.util.Random - import akka.actor.{ActorSystem, Cancellable, Props} import it.unimi.dsi.fastutil.io.{FastBufferedOutputStream, FastByteArrayOutputStream} import sun.nio.ch.DirectBuffer - import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkEnv, SparkException} import org.apache.spark.io.CompressionCodec import org.apache.spark.network._ import org.apache.spark.serializer.Serializer import org.apache.spark.util._ + sealed trait Values case class ByteBufferValues(buffer: ByteBuffer) extends Values @@ -59,6 +57,17 @@ private[spark] class BlockManager( private[storage] val memoryStore: BlockStore = new MemoryStore(this, maxMemory) private[storage] val diskStore = new DiskStore(this, diskBlockManager) + var tachyonInitialized = false + private[storage] lazy val tachyonStore: TachyonStore = { + val storeDir = conf.get("spark.tachyonStore.baseDir", "/tmp_spark_tachyon") + val appFolderName = conf.get("spark.tachyonStore.folderName") + val tachyonStorePath = s"${storeDir}/${appFolderName}/${this.executorId}" + val tachyonMaster = conf.get("spark.tachyonStore.url", "tachyon://localhost:19998") + val tachyonBlockManager = new TachyonBlockManager( + shuffleBlockManager, tachyonStorePath, tachyonMaster) + tachyonInitialized = true + new TachyonStore(this, tachyonBlockManager) + } // If we use Netty for shuffle, start a new Netty-based shuffle sender service. private val nettyPort: Int = { @@ -248,8 +257,10 @@ private[spark] class BlockManager( if (info.tellMaster) { val storageLevel = status.storageLevel val inMemSize = Math.max(status.memSize, droppedMemorySize) + val inTachyonSize = status.tachyonSize val onDiskSize = status.diskSize - master.updateBlockInfo(blockManagerId, blockId, storageLevel, inMemSize, onDiskSize) + master.updateBlockInfo( + blockManagerId, blockId, storageLevel, inMemSize, onDiskSize, inTachyonSize) } else true } @@ -259,22 +270,24 @@ private[spark] class BlockManager( * and the updated in-memory and on-disk sizes. 
*/ private def getCurrentBlockStatus(blockId: BlockId, info: BlockInfo): BlockStatus = { - val (newLevel, inMemSize, onDiskSize) = info.synchronized { + val (newLevel, inMemSize, onDiskSize, inTachyonSize) = info.synchronized { info.level match { case null => - (StorageLevel.NONE, 0L, 0L) + (StorageLevel.NONE, 0L, 0L, 0L) case level => val inMem = level.useMemory && memoryStore.contains(blockId) + val inTachyon = level.useOffHeap && tachyonStore.contains(blockId) val onDisk = level.useDisk && diskStore.contains(blockId) val deserialized = if (inMem) level.deserialized else false - val replication = if (inMem || onDisk) level.replication else 1 - val storageLevel = StorageLevel(onDisk, inMem, deserialized, replication) + val replication = if (inMem || inTachyon || onDisk) level.replication else 1 + val storageLevel = StorageLevel(onDisk, inMem, inTachyon, deserialized, replication) val memSize = if (inMem) memoryStore.getSize(blockId) else 0L + val tachyonSize = if (inTachyon) tachyonStore.getSize(blockId) else 0L val diskSize = if (onDisk) diskStore.getSize(blockId) else 0L - (storageLevel, memSize, diskSize) + (storageLevel, memSize, diskSize, tachyonSize) } } - BlockStatus(newLevel, inMemSize, onDiskSize) + BlockStatus(newLevel, inMemSize, onDiskSize, inTachyonSize) } /** @@ -354,6 +367,24 @@ private[spark] class BlockManager( logDebug("Block " + blockId + " not found in memory") } } + + // Look for the block in Tachyon + if (level.useOffHeap) { + logDebug("Getting block " + blockId + " from tachyon") + if (tachyonStore.contains(blockId)) { + tachyonStore.getBytes(blockId) match { + case Some(bytes) => { + if (!asValues) { + return Some(bytes) + } else { + return Some(dataDeserialize(blockId, bytes)) + } + } + case None => + logDebug("Block " + blockId + " not found in tachyon") + } + } + } // Look for block on disk, potentially storing it back into memory if required: if (level.useDisk) { @@ -620,6 +651,23 @@ private[spark] class BlockManager( } // Keep track of which blocks are dropped from memory res.droppedBlocks.foreach { block => updatedBlocks += block } + } else if (level.useOffHeap) { + // Save to Tachyon. + val res = data match { + case IteratorValues(iterator) => + tachyonStore.putValues(blockId, iterator, level, false) + case ArrayBufferValues(array) => + tachyonStore.putValues(blockId, array, level, false) + case ByteBufferValues(bytes) => { + bytes.rewind(); + tachyonStore.putBytes(blockId, bytes, level) + } + } + size = res.size + res.data match { + case Right(newBytes) => bytesAfterPut = newBytes + case _ => + } } else { // Save directly to disk. // Don't get back the bytes unless we replicate them. @@ -644,8 +692,8 @@ private[spark] class BlockManager( val putBlockStatus = getCurrentBlockStatus(blockId, putBlockInfo) if (putBlockStatus.storageLevel != StorageLevel.NONE) { - // Now that the block is in either the memory or disk store, let other threads read it, - // and tell the master about it. + // Now that the block is in either the memory, tachyon, or disk store, + // let other threads read it, and tell the master about it. 
marked = true putBlockInfo.markReady(size) if (tellMaster) { @@ -707,7 +755,8 @@ private[spark] class BlockManager( */ var cachedPeers: Seq[BlockManagerId] = null private def replicate(blockId: BlockId, data: ByteBuffer, level: StorageLevel) { - val tLevel = StorageLevel(level.useDisk, level.useMemory, level.deserialized, 1) + val tLevel = StorageLevel( + level.useDisk, level.useMemory, level.useOffHeap, level.deserialized, 1) if (cachedPeers == null) { cachedPeers = master.getPeers(blockManagerId, level.replication - 1) } @@ -832,9 +881,10 @@ private[spark] class BlockManager( // Removals are idempotent in disk store and memory store. At worst, we get a warning. val removedFromMemory = memoryStore.remove(blockId) val removedFromDisk = diskStore.remove(blockId) - if (!removedFromMemory && !removedFromDisk) { + val removedFromTachyon = if (tachyonInitialized) tachyonStore.remove(blockId) else false + if (!removedFromMemory && !removedFromDisk && !removedFromTachyon) { logWarning("Block " + blockId + " could not be removed as it was not found in either " + - "the disk or memory store") + "the disk, memory, or tachyon store") } blockInfo.remove(blockId) if (tellMaster && info.tellMaster) { @@ -871,6 +921,9 @@ private[spark] class BlockManager( if (level.useDisk) { diskStore.remove(id) } + if (level.useOffHeap) { + tachyonStore.remove(id) + } iterator.remove() logInfo("Dropped block " + id) } @@ -946,6 +999,9 @@ private[spark] class BlockManager( blockInfo.clear() memoryStore.clear() diskStore.clear() + if (tachyonInitialized) { + tachyonStore.clear() + } metadataCleaner.cancel() broadcastCleaner.cancel() logInfo("BlockManager stopped") diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index ed6937851b836..4bc1b407ad106 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -63,9 +63,10 @@ class BlockManagerMaster(var driverActor: ActorRef, conf: SparkConf) extends Log blockId: BlockId, storageLevel: StorageLevel, memSize: Long, - diskSize: Long): Boolean = { + diskSize: Long, + tachyonSize: Long): Boolean = { val res = askDriverWithReply[Boolean]( - UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize)) + UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize, tachyonSize)) logInfo("Updated info of block " + blockId) res } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index ff2652b640272..378f4cadc17d7 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -73,10 +73,11 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus register(blockManagerId, maxMemSize, slaveActor) sender ! true - case UpdateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) => + case UpdateBlockInfo( + blockManagerId, blockId, storageLevel, deserializedSize, size, tachyonSize) => // TODO: Ideally we want to handle all the message replies in receive instead of in the // individual private methods. 
- updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) + updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size, tachyonSize) case GetLocations(blockId) => sender ! getLocations(blockId) @@ -246,7 +247,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus blockId: BlockId, storageLevel: StorageLevel, memSize: Long, - diskSize: Long) { + diskSize: Long, + tachyonSize: Long) { if (!blockManagerInfo.contains(blockManagerId)) { if (blockManagerId.executorId == "" && !isLocal) { @@ -265,7 +267,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus return } - blockManagerInfo(blockManagerId).updateBlockInfo(blockId, storageLevel, memSize, diskSize) + blockManagerInfo(blockManagerId).updateBlockInfo( + blockId, storageLevel, memSize, diskSize, tachyonSize) var locations: mutable.HashSet[BlockManagerId] = null if (blockLocations.containsKey(blockId)) { @@ -309,8 +312,11 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus } } - -private[spark] case class BlockStatus(storageLevel: StorageLevel, memSize: Long, diskSize: Long) +private[spark] case class BlockStatus( + storageLevel: StorageLevel, + memSize: Long, + diskSize: Long, + tachyonSize: Long) private[spark] class BlockManagerInfo( val blockManagerId: BlockManagerId, @@ -336,7 +342,8 @@ private[spark] class BlockManagerInfo( blockId: BlockId, storageLevel: StorageLevel, memSize: Long, - diskSize: Long) { + diskSize: Long, + tachyonSize: Long) { updateLastSeenMs() @@ -350,23 +357,29 @@ private[spark] class BlockManagerInfo( } if (storageLevel.isValid) { - /* isValid means it is either stored in-memory or on-disk. + /* isValid means it is either stored in-memory, on-disk or on-Tachyon. * But the memSize here indicates the data size in or dropped from memory, + * tachyonSize here indicates the data size in or dropped from Tachyon, * and the diskSize here indicates the data size in or dropped to disk. * They can be both larger than 0, when a block is dropped from memory to disk. * Therefore, a safe way to set BlockStatus is to set its info in accurate modes. */ if (storageLevel.useMemory) { - _blocks.put(blockId, BlockStatus(storageLevel, memSize, 0)) + _blocks.put(blockId, BlockStatus(storageLevel, memSize, 0, 0)) _remainingMem -= memSize logInfo("Added %s in memory on %s (size: %s, free: %s)".format( blockId, blockManagerId.hostPort, Utils.bytesToString(memSize), Utils.bytesToString(_remainingMem))) } if (storageLevel.useDisk) { - _blocks.put(blockId, BlockStatus(storageLevel, 0, diskSize)) + _blocks.put(blockId, BlockStatus(storageLevel, 0, diskSize, 0)) logInfo("Added %s on disk on %s (size: %s)".format( blockId, blockManagerId.hostPort, Utils.bytesToString(diskSize))) } + if (storageLevel.useOffHeap) { + _blocks.put(blockId, BlockStatus(storageLevel, 0, 0, tachyonSize)) + logInfo("Added %s on tachyon on %s (size: %s)".format( + blockId, blockManagerId.hostPort, Utils.bytesToString(tachyonSize))) + } } else if (_blocks.containsKey(blockId)) { // If isValid is not true, drop the block. 
val blockStatus: BlockStatus = _blocks.get(blockId) @@ -381,6 +394,10 @@ private[spark] class BlockManagerInfo( logInfo("Removed %s on %s on disk (size: %s)".format( blockId, blockManagerId.hostPort, Utils.bytesToString(blockStatus.diskSize))) } + if (blockStatus.storageLevel.useOffHeap) { + logInfo("Removed %s on %s on tachyon (size: %s)".format( + blockId, blockManagerId.hostPort, Utils.bytesToString(blockStatus.tachyonSize))) + } } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index bbb9529b5a0ca..8a36b5cc42dfd 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -53,11 +53,12 @@ private[storage] object BlockManagerMessages { var blockId: BlockId, var storageLevel: StorageLevel, var memSize: Long, - var diskSize: Long) + var diskSize: Long, + var tachyonSize: Long) extends ToBlockManagerMaster with Externalizable { - def this() = this(null, null, null, 0, 0) // For deserialization only + def this() = this(null, null, null, 0, 0, 0) // For deserialization only override def writeExternal(out: ObjectOutput) { blockManagerId.writeExternal(out) @@ -65,6 +66,7 @@ private[storage] object BlockManagerMessages { storageLevel.writeExternal(out) out.writeLong(memSize) out.writeLong(diskSize) + out.writeLong(tachyonSize) } override def readExternal(in: ObjectInput) { @@ -73,6 +75,7 @@ private[storage] object BlockManagerMessages { storageLevel = StorageLevel(in) memSize = in.readLong() diskSize = in.readLong() + tachyonSize = in.readLong() } } @@ -81,13 +84,15 @@ private[storage] object BlockManagerMessages { blockId: BlockId, storageLevel: StorageLevel, memSize: Long, - diskSize: Long): UpdateBlockInfo = { - new UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize) + diskSize: Long, + tachyonSize: Long): UpdateBlockInfo = { + new UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize, tachyonSize) } // For pattern-matching - def unapply(h: UpdateBlockInfo): Option[(BlockManagerId, BlockId, StorageLevel, Long, Long)] = { - Some((h.blockManagerId, h.blockId, h.storageLevel, h.memSize, h.diskSize)) + def unapply(h: UpdateBlockInfo) + : Option[(BlockManagerId, BlockId, StorageLevel, Long, Long, Long)] = { + Some((h.blockManagerId, h.blockId, h.storageLevel, h.memSize, h.diskSize, h.tachyonSize)) } } diff --git a/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala b/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala index 4212a539dab4b..95e71de2d3f1d 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala @@ -21,8 +21,9 @@ import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} /** * Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, - * whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory - * in a serialized format, and whether to replicate the RDD partitions on multiple nodes. + * or Tachyon, whether to drop the RDD to disk if it falls out of memory or Tachyon , whether to + * keep the data in memory in a serialized format, and whether to replicate the RDD partitions on + * multiple nodes. * The [[org.apache.spark.storage.StorageLevel$]] singleton object contains some static constants * for commonly useful storage levels. 
To create your own storage level object, use the * factory method of the singleton object (`StorageLevel(...)`). @@ -30,45 +31,58 @@ import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} class StorageLevel private( private var useDisk_ : Boolean, private var useMemory_ : Boolean, + private var useOffHeap_ : Boolean, private var deserialized_ : Boolean, private var replication_ : Int = 1) extends Externalizable { // TODO: Also add fields for caching priority, dataset ID, and flushing. private def this(flags: Int, replication: Int) { - this((flags & 4) != 0, (flags & 2) != 0, (flags & 1) != 0, replication) + this((flags & 8) != 0, (flags & 4) != 0, (flags & 2) != 0, (flags & 1) != 0, replication) } - def this() = this(false, true, false) // For deserialization + def this() = this(false, true, false, false) // For deserialization def useDisk = useDisk_ def useMemory = useMemory_ + def useOffHeap = useOffHeap_ def deserialized = deserialized_ def replication = replication_ assert(replication < 40, "Replication restricted to be less than 40 for calculating hashcodes") + if (useOffHeap) { + require(useDisk == false, "Off-heap storage level does not support using disk") + require(useMemory == false, "Off-heap storage level does not support using heap memory") + require(deserialized == false, "Off-heap storage level does not support deserialized storage") + require(replication == 1, "Off-heap storage level does not support multiple replication") + } + override def clone(): StorageLevel = new StorageLevel( - this.useDisk, this.useMemory, this.deserialized, this.replication) + this.useDisk, this.useMemory, this.useOffHeap, this.deserialized, this.replication) override def equals(other: Any): Boolean = other match { case s: StorageLevel => s.useDisk == useDisk && s.useMemory == useMemory && + s.useOffHeap == useOffHeap && s.deserialized == deserialized && s.replication == replication case _ => false } - def isValid = ((useMemory || useDisk) && (replication > 0)) + def isValid = ((useMemory || useDisk || useOffHeap) && (replication > 0)) def toInt: Int = { var ret = 0 if (useDisk_) { - ret |= 4 + ret |= 8 } if (useMemory_) { + ret |= 4 + } + if (useOffHeap_) { ret |= 2 } if (deserialized_) { @@ -84,8 +98,9 @@ class StorageLevel private( override def readExternal(in: ObjectInput) { val flags = in.readByte() - useDisk_ = (flags & 4) != 0 - useMemory_ = (flags & 2) != 0 + useDisk_ = (flags & 8) != 0 + useMemory_ = (flags & 4) != 0 + useOffHeap_ = (flags & 2) != 0 deserialized_ = (flags & 1) != 0 replication_ = in.readByte() } @@ -93,14 +108,15 @@ class StorageLevel private( @throws(classOf[IOException]) private def readResolve(): Object = StorageLevel.getCachedStorageLevel(this) - override def toString: String = - "StorageLevel(%b, %b, %b, %d)".format(useDisk, useMemory, deserialized, replication) + override def toString: String = "StorageLevel(%b, %b, %b, %b, %d)".format( + useDisk, useMemory, useOffHeap, deserialized, replication) override def hashCode(): Int = toInt * 41 + replication def description : String = { var result = "" result += (if (useDisk) "Disk " else "") result += (if (useMemory) "Memory " else "") + result += (if (useOffHeap) "Tachyon " else "") result += (if (deserialized) "Deserialized " else "Serialized ") result += "%sx Replicated".format(replication) result @@ -113,22 +129,28 @@ class StorageLevel private( * new storage levels. 
*/ object StorageLevel { - val NONE = new StorageLevel(false, false, false) - val DISK_ONLY = new StorageLevel(true, false, false) - val DISK_ONLY_2 = new StorageLevel(true, false, false, 2) - val MEMORY_ONLY = new StorageLevel(false, true, true) - val MEMORY_ONLY_2 = new StorageLevel(false, true, true, 2) - val MEMORY_ONLY_SER = new StorageLevel(false, true, false) - val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, 2) - val MEMORY_AND_DISK = new StorageLevel(true, true, true) - val MEMORY_AND_DISK_2 = new StorageLevel(true, true, true, 2) - val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false) - val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, 2) + val NONE = new StorageLevel(false, false, false, false) + val DISK_ONLY = new StorageLevel(true, false, false, false) + val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2) + val MEMORY_ONLY = new StorageLevel(false, true, false, true) + val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2) + val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false) + val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2) + val MEMORY_AND_DISK = new StorageLevel(true, true, false, true) + val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2) + val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false) + val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2) + val OFF_HEAP = new StorageLevel(false, false, true, false) + + /** Create a new StorageLevel object without setting useOffHeap */ + def apply(useDisk: Boolean, useMemory: Boolean, useOffHeap: Boolean, + deserialized: Boolean, replication: Int) = getCachedStorageLevel( + new StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication)) /** Create a new StorageLevel object */ - def apply(useDisk: Boolean, useMemory: Boolean, deserialized: Boolean, - replication: Int = 1): StorageLevel = - getCachedStorageLevel(new StorageLevel(useDisk, useMemory, deserialized, replication)) + def apply(useDisk: Boolean, useMemory: Boolean, + deserialized: Boolean, replication: Int = 1) = getCachedStorageLevel( + new StorageLevel(useDisk, useMemory, false, deserialized, replication)) /** Create a new StorageLevel object from its integer representation */ def apply(flags: Int, replication: Int): StorageLevel = diff --git a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala index 26565f56ad858..7a174959037be 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala @@ -44,7 +44,7 @@ private[spark] class StorageStatusListener extends SparkListener { storageStatusList.foreach { storageStatus => val unpersistedBlocksIds = storageStatus.rddBlocks.keys.filter(_.rddId == unpersistedRDDId) unpersistedBlocksIds.foreach { blockId => - storageStatus.blocks(blockId) = BlockStatus(StorageLevel.NONE, 0L, 0L) + storageStatus.blocks(blockId) = BlockStatus(StorageLevel.NONE, 0L, 0L, 0L) } } } diff --git a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala index 6153dfe0b7e13..ff6e84cf9819a 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala @@ -48,17 +48,23 @@ class StorageStatus( } private[spark] -class RDDInfo(val id: Int, val name: 
String, val numPartitions: Int, val storageLevel: StorageLevel) - extends Ordered[RDDInfo] { +class RDDInfo( + val id: Int, + val name: String, + val numPartitions: Int, + val storageLevel: StorageLevel) extends Ordered[RDDInfo] { var numCachedPartitions = 0 var memSize = 0L var diskSize = 0L + var tachyonSize= 0L override def toString = { - ("RDD \"%s\" (%d) Storage: %s; CachedPartitions: %d; TotalPartitions: %d; MemorySize: %s; " + - "DiskSize: %s").format(name, id, storageLevel.toString, numCachedPartitions, - numPartitions, Utils.bytesToString(memSize), Utils.bytesToString(diskSize)) + import Utils.bytesToString + ("RDD \"%s\" (%d) Storage: %s; CachedPartitions: %d; TotalPartitions: %d; MemorySize: %s;" + + "TachyonSize: %s; DiskSize: %s").format( + name, id, storageLevel.toString, numCachedPartitions, numPartitions, + bytesToString(memSize), bytesToString(tachyonSize), bytesToString(diskSize)) } override def compare(that: RDDInfo) = { @@ -105,14 +111,17 @@ object StorageUtils { val rddInfoMap = rddInfos.map { info => (info.id, info) }.toMap val rddStorageInfos = blockStatusMap.flatMap { case (rddId, blocks) => - // Add up memory and disk sizes - val persistedBlocks = blocks.filter { status => status.memSize + status.diskSize > 0 } + // Add up memory, disk and Tachyon sizes + val persistedBlocks = + blocks.filter { status => status.memSize + status.diskSize + status.tachyonSize > 0 } val memSize = persistedBlocks.map(_.memSize).reduceOption(_ + _).getOrElse(0L) val diskSize = persistedBlocks.map(_.diskSize).reduceOption(_ + _).getOrElse(0L) + val tachyonSize = persistedBlocks.map(_.tachyonSize).reduceOption(_ + _).getOrElse(0L) rddInfoMap.get(rddId).map { rddInfo => rddInfo.numCachedPartitions = persistedBlocks.length rddInfo.memSize = memSize rddInfo.diskSize = diskSize + rddInfo.tachyonSize = tachyonSize rddInfo } }.toArray diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala new file mode 100644 index 0000000000000..b0b9674856568 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/storage/TachyonBlockManager.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.storage + +import java.text.SimpleDateFormat +import java.util.{Date, Random} + +import tachyon.client.TachyonFS +import tachyon.client.TachyonFile + +import org.apache.spark.Logging +import org.apache.spark.executor.ExecutorExitCode +import org.apache.spark.network.netty.ShuffleSender +import org.apache.spark.util.Utils + + +/** + * Creates and maintains the logical mapping between logical blocks and tachyon fs locations. By + * default, one block is mapped to one file with a name given by its BlockId. 
+ * + * @param rootDirs The directories to use for storing block files. Data will be hashed among these. + */ +private[spark] class TachyonBlockManager( + shuffleManager: ShuffleBlockManager, + rootDirs: String, + val master: String) + extends Logging { + + val client = if (master != null && master != "") TachyonFS.get(master) else null + + if (client == null) { + logError("Failed to connect to the Tachyon as the master address is not configured") + System.exit(ExecutorExitCode.TACHYON_STORE_FAILED_TO_INITIALIZE) + } + + private val MAX_DIR_CREATION_ATTEMPTS = 10 + private val subDirsPerTachyonDir = + shuffleManager.conf.get("spark.tachyonStore.subDirectories", "64").toInt + + // Create one Tachyon directory for each path mentioned in spark.tachyonStore.folderName; + // then, inside this directory, create multiple subdirectories that we will hash files into, + // in order to avoid having really large inodes at the top level in Tachyon. + private val tachyonDirs: Array[TachyonFile] = createTachyonDirs() + private val subDirs = Array.fill(tachyonDirs.length)(new Array[TachyonFile](subDirsPerTachyonDir)) + + addShutdownHook() + + def removeFile(file: TachyonFile): Boolean = { + client.delete(file.getPath(), false) + } + + def fileExists(file: TachyonFile): Boolean = { + client.exist(file.getPath()) + } + + def getFile(filename: String): TachyonFile = { + // Figure out which tachyon directory it hashes to, and which subdirectory in that + val hash = Utils.nonNegativeHash(filename) + val dirId = hash % tachyonDirs.length + val subDirId = (hash / tachyonDirs.length) % subDirsPerTachyonDir + + // Create the subdirectory if it doesn't already exist + var subDir = subDirs(dirId)(subDirId) + if (subDir == null) { + subDir = subDirs(dirId).synchronized { + val old = subDirs(dirId)(subDirId) + if (old != null) { + old + } else { + val path = tachyonDirs(dirId) + "/" + "%02x".format(subDirId) + client.mkdir(path) + val newDir = client.getFile(path) + subDirs(dirId)(subDirId) = newDir + newDir + } + } + } + val filePath = subDir + "/" + filename + if(!client.exist(filePath)) { + client.createFile(filePath) + } + val file = client.getFile(filePath) + file + } + + def getFile(blockId: BlockId): TachyonFile = getFile(blockId.name) + + // TODO: Some of the logic here could be consolidated/de-duplicated with that in the DiskStore. 
+ private def createTachyonDirs(): Array[TachyonFile] = { + logDebug("Creating tachyon directories at root dirs '" + rootDirs + "'") + val dateFormat = new SimpleDateFormat("yyyyMMddHHmmss") + rootDirs.split(",").map { rootDir => + var foundLocalDir = false + var tachyonDir: TachyonFile = null + var tachyonDirId: String = null + var tries = 0 + val rand = new Random() + while (!foundLocalDir && tries < MAX_DIR_CREATION_ATTEMPTS) { + tries += 1 + try { + tachyonDirId = "%s-%04x".format(dateFormat.format(new Date), rand.nextInt(65536)) + val path = rootDir + "/" + "spark-tachyon-" + tachyonDirId + if (!client.exist(path)) { + foundLocalDir = client.mkdir(path) + tachyonDir = client.getFile(path) + } + } catch { + case e: Exception => + logWarning("Attempt " + tries + " to create tachyon dir " + tachyonDir + " failed", e) + } + } + if (!foundLocalDir) { + logError("Failed " + MAX_DIR_CREATION_ATTEMPTS + " attempts to create tachyon dir in " + + rootDir) + System.exit(ExecutorExitCode.TACHYON_STORE_FAILED_TO_CREATE_DIR) + } + logInfo("Created tachyon directory at " + tachyonDir) + tachyonDir + } + } + + private def addShutdownHook() { + tachyonDirs.foreach(tachyonDir => Utils.registerShutdownDeleteDir(tachyonDir)) + Runtime.getRuntime.addShutdownHook(new Thread("delete Spark tachyon dirs") { + override def run() { + logDebug("Shutdown hook called") + tachyonDirs.foreach { tachyonDir => + try { + if (!Utils.hasRootAsShutdownDeleteDir(tachyonDir)) { + Utils.deleteRecursively(tachyonDir, client) + } + } catch { + case t: Throwable => + logError("Exception while deleting tachyon spark dir: " + tachyonDir, t) + } + } + } + }) + } +} diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonFileSegment.scala b/core/src/main/scala/org/apache/spark/storage/TachyonFileSegment.scala new file mode 100644 index 0000000000000..b86abbda1d3e7 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/storage/TachyonFileSegment.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.storage + +import tachyon.client.TachyonFile + +/** + * References a particular segment of a file (potentially the entire file), based off an offset and + * a length. 
+ */ +private[spark] class TachyonFileSegment(val file: TachyonFile, val offset: Long, val length: Long) { + override def toString = "(name=%s, offset=%d, length=%d)".format(file.getPath(), offset, length) +} diff --git a/core/src/main/scala/org/apache/spark/storage/TachyonStore.scala b/core/src/main/scala/org/apache/spark/storage/TachyonStore.scala new file mode 100644 index 0000000000000..c37e76f893605 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/storage/TachyonStore.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.storage + +import java.io.IOException +import java.nio.ByteBuffer + +import scala.collection.mutable.ArrayBuffer + +import tachyon.client.{WriteType, ReadType} + +import org.apache.spark.Logging +import org.apache.spark.util.Utils +import org.apache.spark.serializer.Serializer + + +private class Entry(val size: Long) + + +/** + * Stores BlockManager blocks on Tachyon. + */ +private class TachyonStore( + blockManager: BlockManager, + tachyonManager: TachyonBlockManager) + extends BlockStore(blockManager: BlockManager) with Logging { + + logInfo("TachyonStore started") + + override def getSize(blockId: BlockId): Long = { + tachyonManager.getFile(blockId.name).length + } + + override def putBytes(blockId: BlockId, bytes: ByteBuffer, level: StorageLevel): PutResult = { + putToTachyonStore(blockId, bytes, true) + } + + override def putValues( + blockId: BlockId, + values: ArrayBuffer[Any], + level: StorageLevel, + returnValues: Boolean): PutResult = { + return putValues(blockId, values.toIterator, level, returnValues) + } + + override def putValues( + blockId: BlockId, + values: Iterator[Any], + level: StorageLevel, + returnValues: Boolean): PutResult = { + logDebug("Attempting to write values for block " + blockId) + val _bytes = blockManager.dataSerialize(blockId, values) + putToTachyonStore(blockId, _bytes, returnValues) + } + + private def putToTachyonStore( + blockId: BlockId, + bytes: ByteBuffer, + returnValues: Boolean): PutResult = { + // So that we do not modify the input offsets ! 
+ // duplicate does not copy buffer, so inexpensive + val byteBuffer = bytes.duplicate() + byteBuffer.rewind() + logDebug("Attempting to put block " + blockId + " into Tachyon") + val startTime = System.currentTimeMillis + val file = tachyonManager.getFile(blockId) + val os = file.getOutStream(WriteType.TRY_CACHE) + os.write(byteBuffer.array()) + os.close() + val finishTime = System.currentTimeMillis + logDebug("Block %s stored as %s file in Tachyon in %d ms".format( + blockId, Utils.bytesToString(byteBuffer.limit), (finishTime - startTime))) + + if (returnValues) { + PutResult(bytes.limit(), Right(bytes.duplicate())) + } else { + PutResult(bytes.limit(), null) + } + } + + override def remove(blockId: BlockId): Boolean = { + val file = tachyonManager.getFile(blockId) + if (tachyonManager.fileExists(file)) { + tachyonManager.removeFile(file) + } else { + false + } + } + + override def getValues(blockId: BlockId): Option[Iterator[Any]] = { + getBytes(blockId).map(buffer => blockManager.dataDeserialize(blockId, buffer)) + } + + + override def getBytes(blockId: BlockId): Option[ByteBuffer] = { + val file = tachyonManager.getFile(blockId) + if (file == null || file.getLocationHosts().size == 0) { + return None + } + val is = file.getInStream(ReadType.CACHE) + var buffer: ByteBuffer = null + try { + if (is != null) { + val size = file.length + val bs = new Array[Byte](size.asInstanceOf[Int]) + val fetchSize = is.read(bs, 0, size.asInstanceOf[Int]) + buffer = ByteBuffer.wrap(bs) + if (fetchSize != size) { + logWarning("Failed to fetch the block " + blockId + " from Tachyon : Size " + size + + " is not equal to fetched size " + fetchSize) + return None + } + } + } catch { + case ioe: IOException => { + logWarning("Failed to fetch the block " + blockId + " from Tachyon", ioe) + return None + } + } + Some(buffer) + } + + override def contains(blockId: BlockId): Boolean = { + val file = tachyonManager.getFile(blockId) + tachyonManager.fileExists(file) + } +} diff --git a/core/src/main/scala/org/apache/spark/ui/storage/IndexPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/IndexPage.scala index b2732de51058a..0fa461e5e9d27 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/IndexPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/IndexPage.scala @@ -33,6 +33,7 @@ private[ui] class IndexPage(parent: BlockManagerUI) { private lazy val listener = parent.listener def render(request: HttpServletRequest): Seq[Node] = { + val rdds = listener.rddInfoList val content = UIUtils.listingTable(rddHeader, rddRow, rdds) UIUtils.headerSparkPage(content, basePath, appName, "Storage ", Storage) @@ -45,6 +46,7 @@ private[ui] class IndexPage(parent: BlockManagerUI) { "Cached Partitions", "Fraction Cached", "Size in Memory", + "Size in Tachyon", "Size on Disk") /** Render an HTML row representing an RDD */ @@ -60,6 +62,7 @@ private[ui] class IndexPage(parent: BlockManagerUI) { {rdd.numCachedPartitions} {"%.0f%%".format(rdd.numCachedPartitions * 100.0 / rdd.numPartitions)} {Utils.bytesToString(rdd.memSize)} + {Utils.bytesToString(rdd.tachyonSize)} {Utils.bytesToString(rdd.diskSize)} } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index d9a6af61872d1..2155a8888c85c 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -274,12 +274,14 @@ private[spark] object JsonProtocol { ("Number of Partitions" -> 
rddInfo.numPartitions) ~ ("Number of Cached Partitions" -> rddInfo.numCachedPartitions) ~ ("Memory Size" -> rddInfo.memSize) ~ + ("Tachyon Size" -> rddInfo.tachyonSize) ~ ("Disk Size" -> rddInfo.diskSize) } def storageLevelToJson(storageLevel: StorageLevel): JValue = { ("Use Disk" -> storageLevel.useDisk) ~ ("Use Memory" -> storageLevel.useMemory) ~ + ("Use Tachyon" -> storageLevel.useOffHeap) ~ ("Deserialized" -> storageLevel.deserialized) ~ ("Replication" -> storageLevel.replication) } @@ -288,6 +290,7 @@ private[spark] object JsonProtocol { val storageLevel = storageLevelToJson(blockStatus.storageLevel) ("Storage Level" -> storageLevel) ~ ("Memory Size" -> blockStatus.memSize) ~ + ("Tachyon Size" -> blockStatus.tachyonSize) ~ ("Disk Size" -> blockStatus.diskSize) } @@ -570,11 +573,13 @@ private[spark] object JsonProtocol { val numPartitions = (json \ "Number of Partitions").extract[Int] val numCachedPartitions = (json \ "Number of Cached Partitions").extract[Int] val memSize = (json \ "Memory Size").extract[Long] + val tachyonSize = (json \ "Tachyon Size").extract[Long] val diskSize = (json \ "Disk Size").extract[Long] val rddInfo = new RDDInfo(rddId, name, numPartitions, storageLevel) rddInfo.numCachedPartitions = numCachedPartitions rddInfo.memSize = memSize + rddInfo.tachyonSize = tachyonSize rddInfo.diskSize = diskSize rddInfo } @@ -582,16 +587,18 @@ private[spark] object JsonProtocol { def storageLevelFromJson(json: JValue): StorageLevel = { val useDisk = (json \ "Use Disk").extract[Boolean] val useMemory = (json \ "Use Memory").extract[Boolean] + val useTachyon = (json \ "Use Tachyon").extract[Boolean] val deserialized = (json \ "Deserialized").extract[Boolean] val replication = (json \ "Replication").extract[Int] - StorageLevel(useDisk, useMemory, deserialized, replication) + StorageLevel(useDisk, useMemory, useTachyon, deserialized, replication) } def blockStatusFromJson(json: JValue): BlockStatus = { val storageLevel = storageLevelFromJson(json \ "Storage Level") val memorySize = (json \ "Memory Size").extract[Long] val diskSize = (json \ "Disk Size").extract[Long] - BlockStatus(storageLevel, memorySize, diskSize) + val tachyonSize = (json \ "Tachyon Size").extract[Long] + BlockStatus(storageLevel, memorySize, diskSize, tachyonSize) } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 737b765e2aed6..d3c39dee330b2 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -34,11 +34,13 @@ import com.google.common.io.Files import com.google.common.util.concurrent.ThreadFactoryBuilder import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.json4s._ +import tachyon.client.{TachyonFile,TachyonFS} import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} + /** * Various utility methods used by Spark. 
*/ @@ -153,6 +155,7 @@ private[spark] object Utils extends Logging { } private val shutdownDeletePaths = new scala.collection.mutable.HashSet[String]() + private val shutdownDeleteTachyonPaths = new scala.collection.mutable.HashSet[String]() // Register the path to be deleted via shutdown hook def registerShutdownDeleteDir(file: File) { @@ -162,6 +165,14 @@ private[spark] object Utils extends Logging { } } + // Register the tachyon path to be deleted via shutdown hook + def registerShutdownDeleteDir(tachyonfile: TachyonFile) { + val absolutePath = tachyonfile.getPath() + shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths += absolutePath + } + } + // Is the path already registered to be deleted via a shutdown hook ? def hasShutdownDeleteDir(file: File): Boolean = { val absolutePath = file.getAbsolutePath() @@ -170,6 +181,14 @@ private[spark] object Utils extends Logging { } } + // Is the path already registered to be deleted via a shutdown hook ? + def hasShutdownDeleteTachyonDir(file: TachyonFile): Boolean = { + val absolutePath = file.getPath() + shutdownDeletePaths.synchronized { + shutdownDeletePaths.contains(absolutePath) + } + } + // Note: if file is child of some registered path, while not equal to it, then return true; // else false. This is to ensure that two shutdown hooks do not try to delete each others // paths - resulting in IOException and incomplete cleanup. @@ -186,6 +205,22 @@ private[spark] object Utils extends Logging { retval } + // Note: if file is child of some registered path, while not equal to it, then return true; + // else false. This is to ensure that two shutdown hooks do not try to delete each others + // paths - resulting in Exception and incomplete cleanup. + def hasRootAsShutdownDeleteDir(file: TachyonFile): Boolean = { + val absolutePath = file.getPath() + val retval = shutdownDeletePaths.synchronized { + shutdownDeletePaths.find { path => + !absolutePath.equals(path) && absolutePath.startsWith(path) + }.isDefined + } + if (retval) { + logInfo("path = " + file + ", already present as root for deletion.") + } + retval + } + /** Create a temporary directory inside the given parent directory */ def createTempDir(root: String = System.getProperty("java.io.tmpdir")): File = { var attempts = 0 @@ -541,7 +576,16 @@ private[spark] object Utils extends Logging { } /** - * Check to see if file is a symbolic link. + * Delete a file or directory and its contents recursively. + */ + def deleteRecursively(dir: TachyonFile, client: TachyonFS) { + if (!client.delete(dir.getPath(), true)) { + throw new IOException("Failed to delete the tachyon dir: " + dir) + } + } + + /** + * Check to see if file is a symbolic link. 
*/ def isSymlink(file: File): Boolean = { if (file == null) throw new NullPointerException("File must not be null") diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index e83cd55e73691..b6dd0526105a0 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -96,9 +96,9 @@ class BlockManagerSuite extends FunSuite with BeforeAndAfter with PrivateMethodT } test("StorageLevel object caching") { - val level1 = StorageLevel(false, false, false, 3) - val level2 = StorageLevel(false, false, false, 3) // this should return the same object as level1 - val level3 = StorageLevel(false, false, false, 2) // this should return a different object + val level1 = StorageLevel(false, false, false, false, 3) + val level2 = StorageLevel(false, false, false, false, 3) // this should return the same object as level1 + val level3 = StorageLevel(false, false, false, false, 2) // this should return a different object assert(level2 === level1, "level2 is not same as level1") assert(level2.eq(level1), "level2 is not the same object as level1") assert(level3 != level1, "level3 is same as level1") @@ -410,6 +410,25 @@ class BlockManagerSuite extends FunSuite with BeforeAndAfter with PrivateMethodT assert(store.memoryStore.contains(rdd(0, 3)), "rdd_0_3 was not in store") } + test("tachyon storage") { + // TODO Make the spark.test.tachyon.enable true after using tachyon 0.5.0 testing jar. + val tachyonUnitTestEnabled = conf.getBoolean("spark.test.tachyon.enable", false) + if (tachyonUnitTestEnabled) { + store = new BlockManager("", actorSystem, master, serializer, 1200, conf, securityMgr) + val a1 = new Array[Byte](400) + val a2 = new Array[Byte](400) + val a3 = new Array[Byte](400) + store.putSingle("a1", a1, StorageLevel.OFF_HEAP) + store.putSingle("a2", a2, StorageLevel.OFF_HEAP) + store.putSingle("a3", a3, StorageLevel.OFF_HEAP) + assert(store.getSingle("a3").isDefined, "a3 was in store") + assert(store.getSingle("a2").isDefined, "a2 was in store") + assert(store.getSingle("a1").isDefined, "a1 was in store") + } else { + info("tachyon storage test disabled.") + } + } + test("on-disk storage") { store = new BlockManager("", actorSystem, master, serializer, 1200, conf, securityMgr) val a1 = new Array[Byte](400) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 40c29014c4b59..054eb01a64c11 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -456,7 +456,7 @@ class JsonProtocolSuite extends FunSuite { t.shuffleWriteMetrics = Some(sw) // Make at most 6 blocks t.updatedBlocks = Some((1 to (e % 5 + 1)).map { i => - (RDDBlockId(e % i, f % i), BlockStatus(StorageLevel.MEMORY_AND_DISK_SER_2, a % i, b % i)) + (RDDBlockId(e % i, f % i), BlockStatus(StorageLevel.MEMORY_AND_DISK_SER_2, a % i, b % i, c%i)) }.toSeq) t } @@ -470,19 +470,19 @@ class JsonProtocolSuite extends FunSuite { """ {"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":100,"Stage Name": "greetings","Number of Tasks":200,"RDD Info":{"RDD ID":100,"Name":"mayor","Storage - Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1}, - "Number of Partitions":200,"Number of Cached Partitions":300,"Memory Size":400, - "Disk 
Size":500},"Emitted Task Size Warning":false},"Properties":{"France":"Paris", - "Germany":"Berlin","Russia":"Moscow","Ukraine":"Kiev"}} + Level":{"Use Disk":true,"Use Memory":true,"Use Tachyon":false,"Deserialized":true, + "Replication":1},"Number of Partitions":200,"Number of Cached Partitions":300, + "Memory Size":400,"Disk Size":500,"Tachyon Size":0},"Emitted Task Size Warning":false}, + "Properties":{"France":"Paris","Germany":"Berlin","Russia":"Moscow","Ukraine":"Kiev"}} """ private val stageCompletedJsonString = """ {"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":101,"Stage Name": "greetings","Number of Tasks":201,"RDD Info":{"RDD ID":101,"Name":"mayor","Storage - Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1}, - "Number of Partitions":201,"Number of Cached Partitions":301,"Memory Size":401, - "Disk Size":501},"Emitted Task Size Warning":false}} + Level":{"Use Disk":true,"Use Memory":true,"Use Tachyon":false,"Deserialized":true, + "Replication":1},"Number of Partitions":201,"Number of Cached Partitions":301, + "Memory Size":401,"Disk Size":501,"Tachyon Size":0},"Emitted Task Size Warning":false}} """ private val taskStartJsonString = @@ -515,8 +515,8 @@ class JsonProtocolSuite extends FunSuite { 700,"Fetch Wait Time":900,"Remote Bytes Read":1000},"Shuffle Write Metrics": {"Shuffle Bytes Written":1200,"Shuffle Write Time":1500},"Updated Blocks": [{"Block ID":{"Type":"RDDBlockId","RDD ID":0,"Split Index":0},"Status": - {"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false, - "Replication":2},"Memory Size":0,"Disk Size":0}}]}} + {"Storage Level":{"Use Disk":true,"Use Memory":true,"Use Tachyon":false,"Deserialized":false, + "Replication":2},"Memory Size":0,"Disk Size":0,"Tachyon Size":0}}]}} """ private val jobStartJsonString = diff --git a/docs/configuration.md b/docs/configuration.md index 1ff0150567255..b6005acac8b93 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -122,6 +122,21 @@ Apart from these, the following properties are also available, and may be useful spark.storage.memoryFraction. + + spark.tachyonStore.baseDir + System.getProperty("java.io.tmpdir") + + Directories of the Tachyon File System that store RDDs. The Tachyon file system's URL is set by spark.tachyonStore.url. + It can also be a comma-separated list of multiple directories on Tachyon file system. + + + + spark.tachyonStore.url + tachyon://localhost:19998 + + The URL of the underlying Tachyon file system in the TachyonStore. + + spark.mesos.coarse false @@ -161,13 +176,13 @@ Apart from these, the following properties are also available, and may be useful spark.ui.acls.enable false - Whether spark web ui acls should are enabled. If enabled, this checks to see if the user has + Whether spark web ui acls should are enabled. If enabled, this checks to see if the user has access permissions to view the web ui. See spark.ui.view.acls for more details. Also note this requires the user to be known, if the user comes across as null no checks are done. Filters can be used to authenticate and set the user. - + spark.ui.view.acls Empty @@ -276,10 +291,10 @@ Apart from these, the following properties are also available, and may be useful spark.serializer.objectStreamReset 10000 - When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches - objects to prevent writing redundant data, however that stops garbage collection of those - objects. 
By calling 'reset' you flush that info from the serializer, and allow old - objects to be collected. To turn off this periodic reset set it to a value of <= 0. + When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches + objects to prevent writing redundant data, however that stops garbage collection of those + objects. By calling 'reset' you flush that info from the serializer, and allow old + objects to be collected. To turn off this periodic reset set it to a value of <= 0. By default it will reset the serializer every 10,000 objects. @@ -375,7 +390,7 @@ Apart from these, the following properties are also available, and may be useful spark.akka.heartbeat.interval 1000 - This is set to a larger value to disable failure detector that comes inbuilt akka. It can be enabled again, if you plan to use this feature (Not recommended). A larger interval value in seconds reduces network overhead and a smaller value ( ~ 1 s) might be more informative for akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` and `spark.akka.failure-detector.threshold` if you need to. Only positive use case for using failure detector can be, a sensistive failure detector can help evict rogue executors really quick. However this is usually not the case as gc pauses and network lags are expected in a real spark cluster. Apart from that enabling this leads to a lot of exchanges of heart beats between nodes leading to flooding the network with those. + This is set to a larger value to disable failure detector that comes inbuilt akka. It can be enabled again, if you plan to use this feature (Not recommended). A larger interval value in seconds reduces network overhead and a smaller value ( ~ 1 s) might be more informative for akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` and `spark.akka.failure-detector.threshold` if you need to. Only positive use case for using failure detector can be, a sensistive failure detector can help evict rogue executors really quick. However this is usually not the case as gc pauses and network lags are expected in a real spark cluster. Apart from that enabling this leads to a lot of exchanges of heart beats between nodes leading to flooding the network with those. @@ -430,7 +445,7 @@ Apart from these, the following properties are also available, and may be useful spark.broadcast.blockSize 4096 - Size of each piece of a block in kilobytes for TorrentBroadcastFactory. + Size of each piece of a block in kilobytes for TorrentBroadcastFactory. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit. @@ -555,7 +570,7 @@ Apart from these, the following properties are also available, and may be useful the driver. - + spark.authenticate false @@ -563,7 +578,7 @@ Apart from these, the following properties are also available, and may be useful running on Yarn. - + spark.authenticate.secret None @@ -571,12 +586,12 @@ Apart from these, the following properties are also available, and may be useful not running on Yarn and authentication is enabled. - + spark.core.connection.auth.wait.timeout 30 Number of seconds for the connection to wait for authentication to occur before timing - out and giving up. + out and giving up. 
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index 99412733d4268..77373890eead7 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -23,7 +23,7 @@ To write a Spark application, you need to add a dependency on Spark. If you use groupId = org.apache.spark artifactId = spark-core_{{site.SCALA_BINARY_VERSION}} - version = {{site.SPARK_VERSION}} + version = {{site.SPARK_VERSION}} In addition, if you wish to access an HDFS cluster, you need to add a dependency on `hadoop-client` for your version of HDFS: @@ -73,14 +73,14 @@ The master URL passed to Spark can be in one of the following formats: - - -
 Master URL    Meaning
local Run Spark locally with one worker thread (i.e. no parallelism at all).
local[K] Run Spark locally with K worker threads (ideally, set this to the number of cores on your machine). +
local[K] Run Spark locally with K worker threads (ideally, set this to the number of cores on your machine).
spark://HOST:PORT Connect to the given Spark standalone - cluster master. The port must be whichever one your master is configured to use, which is 7077 by default. +
spark://HOST:PORT Connect to the given Spark standalone + cluster master. The port must be whichever one your master is configured to use, which is 7077 by default.
mesos://HOST:PORT Connect to the given Mesos cluster. - The host parameter is the hostname of the Mesos master. The port must be whichever one the master is configured to use, - which is 5050 by default. +
mesos://HOST:PORT Connect to the given Mesos cluster. + The host parameter is the hostname of the Mesos master. The port must be whichever one the master is configured to use, + which is 5050 by default.
@@ -265,11 +265,25 @@ A complete list of actions is available in the [RDD API doc](api/core/index.html ## RDD Persistence -One of the most important capabilities in Spark is *persisting* (or *caching*) a dataset in memory across operations. When you persist an RDD, each node stores any slices of it that it computes in memory and reuses them in other actions on that dataset (or datasets derived from it). This allows future actions to be much faster (often by more than 10x). Caching is a key tool for building iterative algorithms with Spark and for interactive use from the interpreter. - -You can mark an RDD to be persisted using the `persist()` or `cache()` methods on it. The first time it is computed in an action, it will be kept in memory on the nodes. The cache is fault-tolerant -- if any partition of an RDD is lost, it will automatically be recomputed using the transformations that originally created it. - -In addition, each RDD can be stored using a different *storage level*, allowing you, for example, to persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), or even replicate it across nodes. These levels are chosen by passing a [`org.apache.spark.storage.StorageLevel`](api/core/index.html#org.apache.spark.storage.StorageLevel) object to `persist()`. The `cache()` method is a shorthand for using the default storage level, which is `StorageLevel.MEMORY_ONLY` (store deserialized objects in memory). The complete set of available storage levels is: +One of the most important capabilities in Spark is *persisting* (or *caching*) a dataset in memory +across operations. When you persist an RDD, each node stores any slices of it that it computes in +memory and reuses them in other actions on that dataset (or datasets derived from it). This allows +future actions to be much faster (often by more than 10x). Caching is a key tool for building +iterative algorithms with Spark and for interactive use from the interpreter. + +You can mark an RDD to be persisted using the `persist()` or `cache()` methods on it. The first time +it is computed in an action, it will be kept in memory on the nodes. The cache is fault-tolerant -- +if any partition of an RDD is lost, it will automatically be recomputed using the transformations +that originally created it. + +In addition, each RDD can be stored using a different *storage level*, allowing you, for example, to +persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), +or replicate it across nodes, or store the data in off-heap memory in [Tachyon](http://tachyon-project.org/). +These levels are chosen by passing a +[`org.apache.spark.storage.StorageLevel`](api/core/index.html#org.apache.spark.storage.StorageLevel) +object to `persist()`. The `cache()` method is a shorthand for using the default storage level, +which is `StorageLevel.MEMORY_ONLY` (store deserialized objects in memory). The complete set of +available storage levels is: @@ -292,8 +306,16 @@ In addition, each RDD can be stored using a different *storage level*, allowing - + + + + + @@ -307,30 +329,59 @@ In addition, each RDD can be stored using a different *storage level*, allowing ### Which Storage Level to Choose? -Spark's storage levels are meant to provide different tradeoffs between memory usage and CPU efficiency. -We recommend going through the following process to select one: - -* If your RDDs fit comfortably with the default storage level (`MEMORY_ONLY`), leave them that way. 
This is the most - CPU-efficient option, allowing operations on the RDDs to run as fast as possible. -* If not, try using `MEMORY_ONLY_SER` and [selecting a fast serialization library](tuning.html) to make the objects - much more space-efficient, but still reasonably fast to access. -* Don't spill to disk unless the functions that computed your datasets are expensive, or they filter a large - amount of the data. Otherwise, recomputing a partition is about as fast as reading it from disk. -* Use the replicated storage levels if you want fast fault recovery (e.g. if using Spark to serve requests from a web - application). *All* the storage levels provide full fault tolerance by recomputing lost data, but the replicated ones - let you continue running tasks on the RDD without waiting to recompute a lost partition. - -If you want to define your own storage level (say, with replication factor of 3 instead of 2), then use the function factor method `apply()` of the [`StorageLevel`](api/core/index.html#org.apache.spark.storage.StorageLevel$) singleton object. +Spark's storage levels are meant to provide different trade-offs between memory usage and CPU +efficiency. It allows uses to choose memory, disk, or Tachyon for storing data. We recommend going +through the following process to select one: + +* If your RDDs fit comfortably with the default storage level (`MEMORY_ONLY`), leave them that way. + This is the most CPU-efficient option, allowing operations on the RDDs to run as fast as possible. + +* If not, try using `MEMORY_ONLY_SER` and [selecting a fast serialization library](tuning.html) to +make the objects much more space-efficient, but still reasonably fast to access. You can also use +`OFF_HEAP` mode to store the data off the heap in [Tachyon](http://tachyon-project.org/). This will +significantly reduce JVM GC overhead. + +* Don't spill to disk unless the functions that computed your datasets are expensive, or they filter +a large amount of the data. Otherwise, recomputing a partition is about as fast as reading it from +disk. + +* Use the replicated storage levels if you want fast fault recovery (e.g. if using Spark to serve +requests from a web application). *All* the storage levels provide full fault tolerance by +recomputing lost data, but the replicated ones let you continue running tasks on the RDD without +waiting to recompute a lost partition. + +If you want to define your own storage level (say, with replication factor of 3 instead of 2), then +use the function factor method `apply()` of the +[`StorageLevel`](api/core/index.html#org.apache.spark.storage.StorageLevel$) singleton object. + +Spark has a block manager inside the Executors that let you chose memory, disk, or off-heap. The +latter is for storing RDDs off-heap outside the Executor JVM on top of the memory management system +[Tachyon](http://tachyon-project.org/). This mode has the following advantages: + +* Cached data will not be lost if individual executors crash. +* Executors can have a smaller memory footprint, allowing you to run more executors on the same +machine as the bulk of the memory will be inside Tachyon. +* Reduced GC overhead since data is stored in Tachyon. # Shared Variables -Normally, when a function passed to a Spark operation (such as `map` or `reduce`) is executed on a remote cluster node, it works on separate copies of all the variables used in the function. 
These variables are copied to each machine, and no updates to the variables on the remote machine are propagated back to the driver program. Supporting general, read-write shared variables across tasks would be inefficient. However, Spark does provide two limited types of *shared variables* for two common usage patterns: broadcast variables and accumulators. +Normally, when a function passed to a Spark operation (such as `map` or `reduce`) is executed on a +remote cluster node, it works on separate copies of all the variables used in the function. These +variables are copied to each machine, and no updates to the variables on the remote machine are +propagated back to the driver program. Supporting general, read-write shared variables across tasks +would be inefficient. However, Spark does provide two limited types of *shared variables* for two +common usage patterns: broadcast variables and accumulators. ## Broadcast Variables -Broadcast variables allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a large input dataset in an efficient manner. Spark also attempts to distribute broadcast variables using efficient broadcast algorithms to reduce communication cost. +Broadcast variables allow the programmer to keep a read-only variable cached on each machine rather +than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a +large input dataset in an efficient manner. Spark also attempts to distribute broadcast variables +using efficient broadcast algorithms to reduce communication cost. -Broadcast variables are created from a variable `v` by calling `SparkContext.broadcast(v)`. The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `value` method. The interpreter session below shows this: +Broadcast variables are created from a variable `v` by calling `SparkContext.broadcast(v)`. The +broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `value` +method. The interpreter session below shows this: {% highlight scala %} scala> val broadcastVar = sc.broadcast(Array(1, 2, 3)) @@ -340,13 +391,21 @@ scala> broadcastVar.value res0: Array[Int] = Array(1, 2, 3) {% endhighlight %} -After the broadcast variable is created, it should be used instead of the value `v` in any functions run on the cluster so that `v` is not shipped to the nodes more than once. In addition, the object `v` should not be modified after it is broadcast in order to ensure that all nodes get the same value of the broadcast variable (e.g. if the variable is shipped to a new node later). +After the broadcast variable is created, it should be used instead of the value `v` in any functions +run on the cluster so that `v` is not shipped to the nodes more than once. In addition, the object +`v` should not be modified after it is broadcast in order to ensure that all nodes get the same +value of the broadcast variable (e.g. if the variable is shipped to a new node later). ## Accumulators -Accumulators are variables that are only "added" to through an associative operation and can therefore be efficiently supported in parallel. They can be used to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric value types and standard mutable collections, and programmers can add support for new types. 
+Accumulators are variables that are only "added" to through an associative operation and can +therefore be efficiently supported in parallel. They can be used to implement counters (as in +MapReduce) or sums. Spark natively supports accumulators of numeric value types and standard mutable +collections, and programmers can add support for new types. -An accumulator is created from an initial value `v` by calling `SparkContext.accumulator(v)`. Tasks running on the cluster can then add to it using the `+=` operator. However, they cannot read its value. Only the driver program can read the accumulator's value, using its `value` method. +An accumulator is created from an initial value `v` by calling `SparkContext.accumulator(v)`. Tasks +running on the cluster can then add to it using the `+=` operator. However, they cannot read its +value. Only the driver program can read the accumulator's value, using its `value` method. The interpreter session below shows an accumulator being used to add up the elements of an array: diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala index e5a09ecec006f..d3babc3ed12c8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala @@ -18,8 +18,8 @@ package org.apache.spark.examples import scala.math.random + import org.apache.spark._ -import SparkContext._ /** Computes an approximation to pi */ object SparkPi { diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala new file mode 100644 index 0000000000000..53b303d658386 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples + +import java.util.Random +import scala.math.exp +import org.apache.spark.util.Vector +import org.apache.spark._ +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.scheduler.InputFormatInfo +import org.apache.spark.storage.StorageLevel + +/** + * Logistic regression based classification. + * This example uses Tachyon to persist rdds during computation. 
+ */ +object SparkTachyonHdfsLR { + val D = 10 // Numer of dimensions + val rand = new Random(42) + + case class DataPoint(x: Vector, y: Double) + + def parsePoint(line: String): DataPoint = { + val tok = new java.util.StringTokenizer(line, " ") + var y = tok.nextToken.toDouble + var x = new Array[Double](D) + var i = 0 + while (i < D) { + x(i) = tok.nextToken.toDouble; i += 1 + } + DataPoint(new Vector(x), y) + } + + def main(args: Array[String]) { + if (args.length < 3) { + System.err.println("Usage: SparkTachyonHdfsLR ") + System.exit(1) + } + val inputPath = args(1) + val conf = SparkHadoopUtil.get.newConfiguration() + val sc = new SparkContext(args(0), "SparkTachyonHdfsLR", + System.getenv("SPARK_HOME"), SparkContext.jarOfClass(this.getClass), Map(), + InputFormatInfo.computePreferredLocations( + Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) + )) + val lines = sc.textFile(inputPath) + val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) + val ITERATIONS = args(2).toInt + + // Initialize w to a random value + var w = Vector(D, _ => 2 * rand.nextDouble - 1) + println("Initial w: " + w) + + for (i <- 1 to ITERATIONS) { + println("On iteration " + i) + val gradient = points.map { p => + (1 / (1 + exp(-p.y * (w dot p.x))) - 1) * p.y * p.x + }.reduce(_ + _) + w -= gradient + } + + println("Final w: " + w) + System.exit(0) + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala new file mode 100644 index 0000000000000..ce78f0876ed7c --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples + +import scala.math.random + +import org.apache.spark._ +import org.apache.spark.storage.StorageLevel + +/** + * Computes an approximation to pi + * This example uses Tachyon to persist rdds during computation. 
+ */ +object SparkTachyonPi { + def main(args: Array[String]) { + if (args.length == 0) { + System.err.println("Usage: SparkTachyonPi []") + System.exit(1) + } + val spark = new SparkContext(args(0), "SparkTachyonPi", + System.getenv("SPARK_HOME"), SparkContext.jarOfClass(this.getClass)) + + val slices = if (args.length > 1) args(1).toInt else 2 + val n = 100000 * slices + + val rdd = spark.parallelize(1 to n, slices) + rdd.persist(StorageLevel.OFF_HEAP) + val count = rdd.map { i => + val x = random * 2 - 1 + val y = random * 2 - 1 + if (x * x + y * y < 1) 1 else 0 + }.reduce(_ + _) + println("Pi is roughly " + 4.0 * count / n) + + spark.stop() + } +} diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index c5c697e8e2427..843a874fbfdb0 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -30,7 +30,7 @@ import scala.collection.JavaConversions._ // import com.jsuereth.pgp.sbtplugin.PgpKeys._ object SparkBuild extends Build { - val SPARK_VERSION = "1.0.0-SNAPSHOT" + val SPARK_VERSION = "1.0.0-SNAPSHOT" // Hadoop version to build against. For example, "1.0.4" for Apache releases, or // "2.0.0-mr1-cdh4.2.0" for Cloudera Hadoop. Note that these variables can be set @@ -185,15 +185,14 @@ object SparkBuild extends Build { concurrentRestrictions in Global += Tags.limit(Tags.Test, 1), resolvers ++= Seq( - // HTTPS is unavailable for Maven Central "Maven Repository" at "http://repo.maven.apache.org/maven2", "Apache Repository" at "https://repository.apache.org/content/repositories/releases", "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", - "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Cloudera Repository" at "http://repository.cloudera.com/artifactory/cloudera-repos/", // For Sonatype publishing - //"sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots", - //"sonatype-staging" at "https://oss.sonatype.org/service/local/staging/deploy/maven2/", + // "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots", + // "sonatype-staging" at "https://oss.sonatype.org/service/local/staging/deploy/maven2/", // also check the local Maven repository ~/.m2 Resolver.mavenLocal ), @@ -280,13 +279,18 @@ object SparkBuild extends Build { val slf4jVersion = "1.7.5" val excludeNetty = ExclusionRule(organization = "org.jboss.netty") + val excludeEclipseJetty = ExclusionRule(organization = "org.eclipse.jetty") val excludeAsm = ExclusionRule(organization = "org.ow2.asm") val excludeOldAsm = ExclusionRule(organization = "asm") val excludeCommonsLogging = ExclusionRule(organization = "commons-logging") val excludeSLF4J = ExclusionRule(organization = "org.slf4j") val excludeScalap = ExclusionRule(organization = "org.scala-lang", artifact = "scalap") + val excludeHadoop = ExclusionRule(organization = "org.apache.hadoop") + val excludeCurator = ExclusionRule(organization = "org.apache.curator") + val excludePowermock = ExclusionRule(organization = "org.powermock") - def sparkPreviousArtifact(id: String, organization: String = "org.apache.spark", + + def sparkPreviousArtifact(id: String, organization: String = "org.apache.spark", version: String = "0.9.0-incubating", crossVersion: String = "2.10"): Option[sbt.ModuleID] = { val fullId = if (crossVersion.isEmpty) id else id + "_" + crossVersion Some(organization % fullId % version) // the artifact to compare binary 
compatibility with @@ -323,6 +327,7 @@ object SparkBuild extends Build { "com.codahale.metrics" % "metrics-graphite" % "3.0.0", "com.twitter" %% "chill" % "0.3.1" excludeAll(excludeAsm), "com.twitter" % "chill-java" % "0.3.1" excludeAll(excludeAsm), + "org.tachyonproject" % "tachyon" % "0.4.1-thrift" excludeAll(excludeHadoop, excludeCurator, excludeEclipseJetty, excludePowermock), "com.clearspring.analytics" % "stream" % "2.5.1" ), libraryDependencies ++= maybeAvro diff --git a/python/pyspark/context.py b/python/pyspark/context.py index ff1023bbfa539..d8667e84fedff 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -423,8 +423,11 @@ def _getJavaStorageLevel(self, storageLevel): raise Exception("storageLevel must be of type pyspark.StorageLevel") newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel - return newStorageLevel(storageLevel.useDisk, storageLevel.useMemory, - storageLevel.deserialized, storageLevel.replication) + return newStorageLevel(storageLevel.useDisk, + storageLevel.useMemory, + storageLevel.useOffHeap, + storageLevel.deserialized, + storageLevel.replication) def setJobGroup(self, groupId, description): """ diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 9943296b927dc..fb27863e07f55 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1302,11 +1302,12 @@ def getStorageLevel(self): Get the RDD's current storage level. >>> rdd1 = sc.parallelize([1,2]) >>> rdd1.getStorageLevel() - StorageLevel(False, False, False, 1) + StorageLevel(False, False, False, False, 1) """ java_storage_level = self._jrdd.getStorageLevel() storage_level = StorageLevel(java_storage_level.useDisk(), java_storage_level.useMemory(), + java_storage_level.useOffHeap(), java_storage_level.deserialized(), java_storage_level.replication()) return storage_level diff --git a/python/pyspark/storagelevel.py b/python/pyspark/storagelevel.py index c3e3a44e8e7ab..7b6660eab231b 100644 --- a/python/pyspark/storagelevel.py +++ b/python/pyspark/storagelevel.py @@ -25,23 +25,25 @@ class StorageLevel: Also contains static constants for some commonly used storage levels, such as MEMORY_ONLY. 
""" - def __init__(self, useDisk, useMemory, deserialized, replication = 1): + def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication = 1): self.useDisk = useDisk self.useMemory = useMemory + self.useOffHeap = useOffHeap self.deserialized = deserialized self.replication = replication def __repr__(self): - return "StorageLevel(%s, %s, %s, %s)" % ( - self.useDisk, self.useMemory, self.deserialized, self.replication) + return "StorageLevel(%s, %s, %s, %s, %s)" % ( + self.useDisk, self.useMemory, self.useOffHeap, self.deserialized, self.replication) -StorageLevel.DISK_ONLY = StorageLevel(True, False, False) -StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, 2) -StorageLevel.MEMORY_ONLY = StorageLevel(False, True, True) -StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, True, 2) -StorageLevel.MEMORY_ONLY_SER = StorageLevel(False, True, False) -StorageLevel.MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, 2) -StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, True) -StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, True, 2) -StorageLevel.MEMORY_AND_DISK_SER = StorageLevel(True, True, False) -StorageLevel.MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, 2) +StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False) +StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2) +StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, True) +StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, True, 2) +StorageLevel.MEMORY_ONLY_SER = StorageLevel(False, True, False, False) +StorageLevel.MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, False, 2) +StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, True) +StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, True, 2) +StorageLevel.MEMORY_AND_DISK_SER = StorageLevel(True, True, False, False) +StorageLevel.MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, False, 2) +StorageLevel.OFF_HEAP = StorageLevel(False, False, True, False, 1) \ No newline at end of file From 8de038eb366ded2ac74f72517e40545dbbab8cdd Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 4 Apr 2014 21:15:33 -0700 Subject: [PATCH 09/21] [SQL] SPARK-1366 Consistent sql function across different types of SQLContexts Now users who want to use HiveQL should explicitly say `hiveql` or `hql`. Author: Michael Armbrust Closes #319 from marmbrus/standardizeSqlHql and squashes the following commits: de68d0e [Michael Armbrust] Fix sampling test. fbe4a54 [Michael Armbrust] Make `sql` always use spark sql parser, users of hive context can now use hql or hiveql to run queries using HiveQL instead. 
--- .../spark/sql/examples/HiveFromSpark.scala | 12 ++--- .../apache/spark/sql/hive/HiveContext.scala | 17 ++++--- .../org/apache/spark/sql/hive/TestHive.scala | 12 ++--- .../hive/execution/HiveComparisonTest.scala | 10 ++-- .../sql/hive/execution/HiveQuerySuite.scala | 12 ++++- .../hive/execution/HiveResolutionSuite.scala | 2 +- .../sql/hive/execution/PruningSuite.scala | 2 +- .../spark/sql/parquet/HiveParquetSuite.scala | 46 +++++++++---------- 8 files changed, 63 insertions(+), 50 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/sql/examples/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/sql/examples/HiveFromSpark.scala index abcc1f04d4279..62329bde84481 100644 --- a/examples/src/main/scala/org/apache/spark/sql/examples/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/sql/examples/HiveFromSpark.scala @@ -33,20 +33,20 @@ object HiveFromSpark { val hiveContext = new LocalHiveContext(sc) import hiveContext._ - sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - sql("LOAD DATA LOCAL INPATH 'src/main/resources/kv1.txt' INTO TABLE src") + hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + hql("LOAD DATA LOCAL INPATH 'src/main/resources/kv1.txt' INTO TABLE src") // Queries are expressed in HiveQL println("Result of 'SELECT *': ") - sql("SELECT * FROM src").collect.foreach(println) + hql("SELECT * FROM src").collect.foreach(println) // Aggregation queries are also supported. - val count = sql("SELECT COUNT(*) FROM src").collect().head.getInt(0) + val count = hql("SELECT COUNT(*) FROM src").collect().head.getInt(0) println(s"COUNT(*): $count") // The results of SQL queries are themselves RDDs and support all normal RDD functions. The // items in the RDD are of type Row, which allows you to access each column by ordinal. - val rddFromSql = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key") + val rddFromSql = hql("SELECT key, value FROM src WHERE key < 10 ORDER BY key") println("Result of RDD.map:") val rddAsStrings = rddFromSql.map { @@ -59,6 +59,6 @@ object HiveFromSpark { // Queries can then join RDD data with data stored in Hive. println("Result of SELECT *:") - sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) + hql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index ff8eaacded4c8..f66a667c0a942 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -67,14 +67,13 @@ class LocalHiveContext(sc: SparkContext) extends HiveContext(sc) { class HiveContext(sc: SparkContext) extends SQLContext(sc) { self => - override def parseSql(sql: String): LogicalPlan = HiveQl.parseSql(sql) - override def executePlan(plan: LogicalPlan): this.QueryExecution = + override protected[sql] def executePlan(plan: LogicalPlan): this.QueryExecution = new this.QueryExecution { val logical = plan } /** * Executes a query expressed in HiveQL using Spark, returning the result as a SchemaRDD. */ - def hql(hqlQuery: String): SchemaRDD = { + def hiveql(hqlQuery: String): SchemaRDD = { val result = new SchemaRDD(this, HiveQl.parseSql(hqlQuery)) // We force query optimization to happen right away instead of letting it happen lazily like // when using the query DSL. This is so DDL commands behave as expected. 
This is only @@ -83,6 +82,9 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { result } + /** An alias for `hiveql`. */ + def hql(hqlQuery: String): SchemaRDD = hiveql(hqlQuery) + // Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur. @transient protected val outputBuffer = new java.io.OutputStream { @@ -120,7 +122,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { /* A catalyst metadata catalog that points to the Hive Metastore. */ @transient - override lazy val catalog = new HiveMetastoreCatalog(this) with OverrideCatalog { + override protected[sql] lazy val catalog = new HiveMetastoreCatalog(this) with OverrideCatalog { override def lookupRelation( databaseName: Option[String], tableName: String, @@ -132,7 +134,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { /* An analyzer that uses the Hive metastore. */ @transient - override lazy val analyzer = new Analyzer(catalog, HiveFunctionRegistry, caseSensitive = false) + override protected[sql] lazy val analyzer = + new Analyzer(catalog, HiveFunctionRegistry, caseSensitive = false) /** * Runs the specified SQL query using Hive. @@ -214,14 +217,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } @transient - override val planner = hivePlanner + override protected[sql] val planner = hivePlanner @transient protected lazy val emptyResult = sparkContext.parallelize(Seq(new GenericRow(Array[Any]()): Row), 1) /** Extends QueryExecution with hive specific features. */ - abstract class QueryExecution extends super.QueryExecution { + protected[sql] abstract class QueryExecution extends super.QueryExecution { // TODO: Create mixin for the analyzer instead of overriding things here. override lazy val optimizedPlan = optimizer(catalog.PreInsertionCasts(catalog.CreateTables(analyzed))) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 0a6bea0162430..2fea9702954d7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -110,10 +110,10 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { val describedTable = "DESCRIBE (\\w+)".r - class SqlQueryExecution(sql: String) extends this.QueryExecution { - lazy val logical = HiveQl.parseSql(sql) - def hiveExec() = runSqlHive(sql) - override def toString = sql + "\n" + super.toString + protected[hive] class HiveQLQueryExecution(hql: String) extends this.QueryExecution { + lazy val logical = HiveQl.parseSql(hql) + def hiveExec() = runSqlHive(hql) + override def toString = hql + "\n" + super.toString } /** @@ -140,8 +140,8 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { case class TestTable(name: String, commands: (()=>Unit)*) - implicit class SqlCmd(sql: String) { - def cmd = () => new SqlQueryExecution(sql).stringResult(): Unit + protected[hive] implicit class SqlCmd(sql: String) { + def cmd = () => new HiveQLQueryExecution(sql).stringResult(): Unit } /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 18654b308d234..3cc4562a88d66 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -125,7 +125,7 @@ 
abstract class HiveComparisonTest } protected def prepareAnswer( - hiveQuery: TestHive.type#SqlQueryExecution, + hiveQuery: TestHive.type#HiveQLQueryExecution, answer: Seq[String]): Seq[String] = { val orderedAnswer = hiveQuery.logical match { // Clean out non-deterministic time schema info. @@ -227,7 +227,7 @@ abstract class HiveComparisonTest try { // MINOR HACK: You must run a query before calling reset the first time. - TestHive.sql("SHOW TABLES") + TestHive.hql("SHOW TABLES") if (reset) { TestHive.reset() } val hiveCacheFiles = queryList.zipWithIndex.map { @@ -256,7 +256,7 @@ abstract class HiveComparisonTest hiveCachedResults } else { - val hiveQueries = queryList.map(new TestHive.SqlQueryExecution(_)) + val hiveQueries = queryList.map(new TestHive.HiveQLQueryExecution(_)) // Make sure we can at least parse everything before attempting hive execution. hiveQueries.foreach(_.logical) val computedResults = (queryList.zipWithIndex, hiveQueries, hiveCacheFiles).zipped.map { @@ -302,7 +302,7 @@ abstract class HiveComparisonTest // Run w/ catalyst val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) => - val query = new TestHive.SqlQueryExecution(queryString) + val query = new TestHive.HiveQLQueryExecution(queryString) try { (query, prepareAnswer(query, query.stringResult())) } catch { case e: Exception => val errorMessage = @@ -359,7 +359,7 @@ abstract class HiveComparisonTest // When we encounter an error we check to see if the environment is still okay by running a simple query. // If this fails then we halt testing since something must have gone seriously wrong. try { - new TestHive.SqlQueryExecution("SELECT key FROM src").stringResult() + new TestHive.HiveQLQueryExecution("SELECT key FROM src").stringResult() TestHive.runSqlHive("SELECT key FROM src") } catch { case e: Exception => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index c184ebe288af4..0c27498a93507 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -23,6 +23,16 @@ import org.apache.spark.sql.hive.TestHive._ * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution. 
*/ class HiveQuerySuite extends HiveComparisonTest { + + test("Query expressed in SQL") { + assert(sql("SELECT 1").collect() === Array(Seq(1))) + } + + test("Query expressed in HiveQL") { + hql("FROM src SELECT key").collect() + hiveql("FROM src SELECT key").collect() + } + createQueryTest("Simple Average", "SELECT AVG(key) FROM src") @@ -133,7 +143,7 @@ class HiveQuerySuite extends HiveComparisonTest { "SELECT * FROM src LATERAL VIEW explode(map(key+3,key+4)) D as k, v") test("sampling") { - sql("SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s") + hql("SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index 40c4e23f90fb8..8883e5b16d4da 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -56,7 +56,7 @@ class HiveResolutionSuite extends HiveComparisonTest { TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2)) :: Nil) .registerAsTable("caseSensitivityTest") - sql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") + hql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") } /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala index 1318ac1968dad..d9ccb93e23923 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala @@ -136,7 +136,7 @@ class PruningSuite extends HiveComparisonTest { expectedScannedColumns: Seq[String], expectedPartValues: Seq[Seq[String]]) = { test(s"$testCaseName - pruning test") { - val plan = new TestHive.SqlQueryExecution(sql).executedPlan + val plan = new TestHive.HiveQLQueryExecution(sql).executedPlan val actualOutputColumns = plan.output.map(_.name) val (actualScannedColumns, actualPartValues) = plan.collect { case p @ HiveTableScan(columns, relation, _) => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala index 314ca48ad8f6a..aade62eb8f84e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala @@ -57,34 +57,34 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft } test("SELECT on Parquet table") { - val rdd = sql("SELECT * FROM testsource").collect() + val rdd = hql("SELECT * FROM testsource").collect() assert(rdd != null) assert(rdd.forall(_.size == 6)) } test("Simple column projection + filter on Parquet table") { - val rdd = sql("SELECT myboolean, mylong FROM testsource WHERE myboolean=true").collect() + val rdd = hql("SELECT myboolean, mylong FROM testsource WHERE myboolean=true").collect() assert(rdd.size === 5, "Filter returned incorrect number of rows") assert(rdd.forall(_.getBoolean(0)), "Filter returned incorrect Boolean field value") } test("Converting Hive to Parquet Table via saveAsParquetFile") { - sql("SELECT * FROM src").saveAsParquetFile(dirname.getAbsolutePath) + hql("SELECT * FROM src").saveAsParquetFile(dirname.getAbsolutePath) parquetFile(dirname.getAbsolutePath).registerAsTable("ptable") - val 
rddOne = sql("SELECT * FROM src").collect().sortBy(_.getInt(0)) - val rddTwo = sql("SELECT * from ptable").collect().sortBy(_.getInt(0)) + val rddOne = hql("SELECT * FROM src").collect().sortBy(_.getInt(0)) + val rddTwo = hql("SELECT * from ptable").collect().sortBy(_.getInt(0)) compareRDDs(rddOne, rddTwo, "src (Hive)", Seq("key:Int", "value:String")) } test("INSERT OVERWRITE TABLE Parquet table") { - sql("SELECT * FROM testsource").saveAsParquetFile(dirname.getAbsolutePath) + hql("SELECT * FROM testsource").saveAsParquetFile(dirname.getAbsolutePath) parquetFile(dirname.getAbsolutePath).registerAsTable("ptable") // let's do three overwrites for good measure - sql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() - sql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() - sql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() - val rddCopy = sql("SELECT * FROM ptable").collect() - val rddOrig = sql("SELECT * FROM testsource").collect() + hql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() + hql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() + hql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() + val rddCopy = hql("SELECT * FROM ptable").collect() + val rddOrig = hql("SELECT * FROM testsource").collect() assert(rddCopy.size === rddOrig.size, "INSERT OVERWRITE changed size of table??") compareRDDs(rddOrig, rddCopy, "testsource", ParquetTestData.testSchemaFieldNames) } @@ -93,13 +93,13 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft createParquetFile(dirname.getAbsolutePath, ("key", IntegerType), ("value", StringType)) .registerAsTable("tmp") val rddCopy = - sql("INSERT INTO TABLE tmp SELECT * FROM src") + hql("INSERT INTO TABLE tmp SELECT * FROM src") .collect() .sortBy[Int](_.apply(0) match { case x: Int => x case _ => 0 }) - val rddOrig = sql("SELECT * FROM src") + val rddOrig = hql("SELECT * FROM src") .collect() .sortBy(_.getInt(0)) compareRDDs(rddOrig, rddCopy, "src (Hive)", Seq("key:Int", "value:String")) @@ -108,22 +108,22 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft test("Appending to Parquet table") { createParquetFile(dirname.getAbsolutePath, ("key", IntegerType), ("value", StringType)) .registerAsTable("tmpnew") - sql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() - sql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() - sql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() - val rddCopies = sql("SELECT * FROM tmpnew").collect() - val rddOrig = sql("SELECT * FROM src").collect() + hql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() + hql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() + hql("INSERT INTO TABLE tmpnew SELECT * FROM src").collect() + val rddCopies = hql("SELECT * FROM tmpnew").collect() + val rddOrig = hql("SELECT * FROM src").collect() assert(rddCopies.size === 3 * rddOrig.size, "number of copied rows via INSERT INTO did not match correct number") } test("Appending to and then overwriting Parquet table") { createParquetFile(dirname.getAbsolutePath, ("key", IntegerType), ("value", StringType)) .registerAsTable("tmp") - sql("INSERT INTO TABLE tmp SELECT * FROM src").collect() - sql("INSERT INTO TABLE tmp SELECT * FROM src").collect() - sql("INSERT OVERWRITE TABLE tmp SELECT * FROM src").collect() - val rddCopies = sql("SELECT * FROM tmp").collect() - val rddOrig = sql("SELECT * FROM src").collect() + hql("INSERT INTO TABLE tmp SELECT * FROM 
src").collect() + hql("INSERT INTO TABLE tmp SELECT * FROM src").collect() + hql("INSERT OVERWRITE TABLE tmp SELECT * FROM src").collect() + val rddCopies = hql("SELECT * FROM tmp").collect() + val rddOrig = hql("SELECT * FROM src").collect() assert(rddCopies.size === rddOrig.size, "INSERT OVERWRITE did not actually overwrite") } From 0acc7a02b4323f4e0b7736bc1999bdcedab41f39 Mon Sep 17 00:00:00 2001 From: Prabeesh K Date: Fri, 4 Apr 2014 21:32:00 -0700 Subject: [PATCH 10/21] small fix ( proogram -> program ) Author: Prabeesh K Closes #331 from prabeesh/patch-3 and squashes the following commits: 9399eb5 [Prabeesh K] small fix(proogram -> program) --- docs/quick-start.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/quick-start.md b/docs/quick-start.md index 13df6beea16e8..60e8b1ba0eb46 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -124,7 +124,7 @@ object SimpleApp { } {% endhighlight %} -This program just counts the number of lines containing 'a' and the number containing 'b' in the Spark README. Note that you'll need to replace $YOUR_SPARK_HOME with the location where Spark is installed. Unlike the earlier examples with the Spark shell, which initializes its own SparkContext, we initialize a SparkContext as part of the proogram. We pass the SparkContext constructor four arguments, the type of scheduler we want to use (in this case, a local scheduler), a name for the application, the directory where Spark is installed, and a name for the jar file containing the application's code. The final two arguments are needed in a distributed setting, where Spark is running across several nodes, so we include them for completeness. Spark will automatically ship the jar files you list to slave nodes. +This program just counts the number of lines containing 'a' and the number containing 'b' in the Spark README. Note that you'll need to replace $YOUR_SPARK_HOME with the location where Spark is installed. Unlike the earlier examples with the Spark shell, which initializes its own SparkContext, we initialize a SparkContext as part of the program. We pass the SparkContext constructor four arguments, the type of scheduler we want to use (in this case, a local scheduler), a name for the application, the directory where Spark is installed, and a name for the jar file containing the application's code. The final two arguments are needed in a distributed setting, where Spark is running across several nodes, so we include them for completeness. Spark will automatically ship the jar files you list to slave nodes. This file depends on the Spark API, so we'll also include an sbt configuration file, `simple.sbt` which explains that Spark is a dependency. This file also adds a repository that Spark depends on: From 7c18428fac1403eb9c69b61890453964b255c432 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Fri, 4 Apr 2014 22:49:19 -0700 Subject: [PATCH 11/21] HOTFIX for broken CI, by SPARK-1336 Learnt about `set -o pipefail` is very useful. Author: Prashant Sharma Author: Prashant Sharma Closes #321 from ScrapCodes/hf-SPARK-1336 and squashes the following commits: 9d22bc2 [Prashant Sharma] added comment why echo -e q exists. f865951 [Prashant Sharma] made error to match with word boundry so errors does not match. This is there to make sure build fails if provided SparkBuild has compile errors. 7fffdf2 [Prashant Sharma] Removed a stray line. 
97379d8 [Prashant Sharma] HOTFIX for broken CI, by SPARK-1336 --- .rat-excludes | 1 + dev/run-tests | 10 +++++----- dev/scalastyle | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.rat-excludes b/.rat-excludes index 17cf6d0ed1cf3..85bfad60fcadc 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -39,3 +39,4 @@ work .*\.q golden test.out/* +.*iml diff --git a/dev/run-tests b/dev/run-tests index fff949e04fcd7..6ad674a2ba127 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -26,13 +26,12 @@ rm -rf ./work # Fail fast set -e - +set -o pipefail if test -x "$JAVA_HOME/bin/java"; then declare java_cmd="$JAVA_HOME/bin/java" else declare java_cmd=java fi - JAVA_VERSION=$($java_cmd -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q') [ "$JAVA_VERSION" -ge 18 ] && echo "" || echo "[Warn] Java 8 tests will not run because JDK version is < 1.8." @@ -49,7 +48,9 @@ dev/scalastyle echo "=========================================================================" echo "Running Spark unit tests" echo "=========================================================================" -sbt/sbt assembly test +# echo "q" is needed because sbt on encountering a build file with failure (either resolution or compilation) +# prompts the user for input either q, r, etc to quit or retry. This echo is there to make it not block. +echo -e "q\n" | sbt/sbt assembly test | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" echo "=========================================================================" echo "Running PySpark tests" @@ -63,5 +64,4 @@ echo "=========================================================================" echo "Detecting binary incompatibilites with MiMa" echo "=========================================================================" ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore -sbt/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" - +echo -e "q\n" | sbt/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" diff --git a/dev/scalastyle b/dev/scalastyle index 5a18f4d672825..19955b9aaaad3 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -17,8 +17,8 @@ # limitations under the License. # -sbt/sbt clean scalastyle > scalastyle.txt -ERRORS=$(cat scalastyle.txt | grep -e "error file") +echo -e "q\n" | sbt/sbt clean scalastyle > scalastyle.txt +ERRORS=$(cat scalastyle.txt | grep -e "\") if test ! -z "$ERRORS"; then echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" exit 1 From 2d0150c1a2688296346fa279b1f8d14edac935eb Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Sat, 5 Apr 2014 15:17:50 -0700 Subject: [PATCH 12/21] Remove the getStageInfo() method from SparkContext. This method exposes the Stage objects, which are private to Spark and should not be exposed to the user. This method was added in https://github.com/apache/spark/commit/01d77f329f5878b7c8672bbdc1859f3ca95d759d; ccing @squito here in case there's a good reason to keep this! Author: Kay Ousterhout Closes #308 from kayousterhout/remove_public_method and squashes the following commits: 2e2f009 [Kay Ousterhout] Remove the getStageInfo() method from SparkContext. 
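For callers that depended on the removed method, stage-level information remains observable through the public listener API. The following is a rough, illustrative sketch only; it is not part of this patch, and the `stageInfo` field name on the completion event is an assumption about the listener API of this era rather than something shown in the diff:

{% highlight scala %}
// Hypothetical sketch: observe stage information via a SparkListener instead of
// the removed SparkContext.getStageInfo. Event field names are assumptions.
import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}

class StageLogger extends SparkListener {
  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) {
    // Assumed to carry the public StageInfo for the finished stage.
    println("Completed stage: " + stageCompleted.stageInfo.name)
  }
}

// Usage, assuming an existing SparkContext `sc`:
//   sc.addSparkListener(new StageLogger)
{% endhighlight %}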
--- core/src/main/scala/org/apache/spark/SparkContext.scala | 4 ---- .../main/scala/org/apache/spark/scheduler/DAGScheduler.scala | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index fcf16ce1b278e..8382dd44f3484 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -731,10 +731,6 @@ class SparkContext( */ def getPersistentRDDs: Map[Int, RDD[_]] = persistentRdds.toMap - def getStageInfo: Map[Stage, StageInfo] = { - dagScheduler.stageToInfos - } - /** * Return information about blocks stored in all of the slaves */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index ef3d24d746829..442a95bb2c44b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -86,7 +86,7 @@ class DAGScheduler( private[scheduler] val shuffleToMapStage = new TimeStampedHashMap[Int, Stage] private[scheduler] val jobIdToActiveJob = new HashMap[Int, ActiveJob] private[scheduler] val resultStageToJob = new HashMap[Stage, ActiveJob] - private[spark] val stageToInfos = new TimeStampedHashMap[Stage, StageInfo] + private[scheduler] val stageToInfos = new TimeStampedHashMap[Stage, StageInfo] // Stages we need to run whose parents aren't done private[scheduler] val waitingStages = new HashSet[Stage] From 6e88583aef7d8caf59d53c9fcb659a62d2cd6051 Mon Sep 17 00:00:00 2001 From: Mridul Muralidharan Date: Sat, 5 Apr 2014 15:23:37 -0700 Subject: [PATCH 13/21] [SPARK-1371] fix computePreferredLocations signature to not depend on underlying implementation Change to Map and Set - not mutable HashMap and HashSet Author: Mridul Muralidharan Closes #302 from mridulm/master and squashes the following commits: df747af [Mridul Muralidharan] Address review comments 17e2907 [Mridul Muralidharan] fix computePreferredLocations signature to not depend on underlying implementation --- .../scala/org/apache/spark/scheduler/InputFormatInfo.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index 5555585c8b4cd..b3f2cb346f7da 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -164,8 +164,7 @@ object InputFormatInfo { PS: I know the wording here is weird, hopefully it makes some sense ! */ - def computePreferredLocations(formats: Seq[InputFormatInfo]): HashMap[String, HashSet[SplitInfo]] - = { + def computePreferredLocations(formats: Seq[InputFormatInfo]): Map[String, Set[SplitInfo]] = { val nodeToSplit = new HashMap[String, HashSet[SplitInfo]] for (inputSplit <- formats) { @@ -178,6 +177,6 @@ object InputFormatInfo { } } - nodeToSplit + nodeToSplit.mapValues(_.toSet).toMap } } From 890d63bd4e16296ac70e151b3754727ea42b583c Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 5 Apr 2014 19:08:24 -0700 Subject: [PATCH 14/21] Fix for PR #195 for Java 6 Use Java 6's recommended equivalent of Java 7's Logger.getGlobal() to retain Java 6 compatibility. 
See PR #195 Author: Sean Owen Closes #334 from srowen/FixPR195ForJava6 and squashes the following commits: f92fbd3 [Sean Owen] Use Java 6's recommended equivalent of Java 7's Logger.getGlobal() to retain Java 6 compatibility --- .../scala/org/apache/spark/sql/parquet/ParquetRelation.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index 114bfbb719ee9..505ad0a2c77c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -103,7 +103,7 @@ private[sql] object ParquetRelation { SLF4JBridgeHandler.install() for(name <- loggerNames) { val logger = Logger.getLogger(name) - logger.setParent(Logger.getGlobal) + logger.setParent(Logger.getLogger(Logger.GLOBAL_LOGGER_NAME)) logger.setUseParentHandlers(true) } } From 0b855167818b9afd2d2aa9f617b9861d77b2425d Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sat, 5 Apr 2014 20:52:05 -0700 Subject: [PATCH 15/21] SPARK-1421. Make MLlib work on Python 2.6 The reason it wasn't working was passing a bytearray to stream.write(), which is not supported in Python 2.6 but is in 2.7. (This array came from NumPy when we converted data to send it over to Java). Now we just convert those bytearrays to strings of bytes, which preserves nonprintable characters as well. Author: Matei Zaharia Closes #335 from mateiz/mllib-python-2.6 and squashes the following commits: f26c59f [Matei Zaharia] Update docs to no longer say we need Python 2.7 a84d6af [Matei Zaharia] SPARK-1421. Make MLlib work on Python 2.6 --- docs/mllib-guide.md | 3 +-- docs/python-programming-guide.md | 2 +- python/pyspark/mllib/__init__.py | 6 +----- python/pyspark/serializers.py | 11 ++++++++++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 203d235bf9663..a5e0cc50809cf 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -38,6 +38,5 @@ depends on native Fortran routines. You may need to install the if it is not already present on your nodes. MLlib will throw a linking error if it cannot detect these libraries automatically. -To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer -and Python 2.7. +To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer. diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md index cbe7d820b455e..c2e5327324898 100644 --- a/docs/python-programming-guide.md +++ b/docs/python-programming-guide.md @@ -152,7 +152,7 @@ Many of the methods also contain [doctests](http://docs.python.org/2/library/doc # Libraries [MLlib](mllib-guide.html) is also available in PySpark. To use it, you'll need -[NumPy](http://www.numpy.org) version 1.7 or newer, and Python 2.7. The [MLlib guide](mllib-guide.html) contains +[NumPy](http://www.numpy.org) version 1.7 or newer. The [MLlib guide](mllib-guide.html) contains some example applications. # Where to Go from Here diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index b420d7a7f23ba..538ff26ce7c33 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -19,11 +19,7 @@ Python bindings for MLlib. 
""" -# MLlib currently needs Python 2.7+ and NumPy 1.7+, so complain if lower - -import sys -if sys.version_info[0:2] < (2, 7): - raise Exception("MLlib requires Python 2.7+") +# MLlib currently needs and NumPy 1.7+, so complain if lower import numpy if numpy.version.version < '1.7': diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 4d802924df4a1..b253807974a2e 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -64,6 +64,7 @@ from itertools import chain, izip, product import marshal import struct +import sys from pyspark import cloudpickle @@ -113,6 +114,11 @@ class FramedSerializer(Serializer): where C{length} is a 32-bit integer and data is C{length} bytes. """ + def __init__(self): + # On Python 2.6, we can't write bytearrays to streams, so we need to convert them + # to strings first. Check if the version number is that old. + self._only_write_strings = sys.version_info[0:2] <= (2, 6) + def dump_stream(self, iterator, stream): for obj in iterator: self._write_with_length(obj, stream) @@ -127,7 +133,10 @@ def load_stream(self, stream): def _write_with_length(self, obj, stream): serialized = self.dumps(obj) write_int(len(serialized), stream) - stream.write(serialized) + if self._only_write_strings: + stream.write(str(serialized)) + else: + stream.write(serialized) def _read_with_length(self, stream): length = read_int(stream) From 7012ffafad8fa876aa8bcb0b848445eec6734ef1 Mon Sep 17 00:00:00 2001 From: witgo Date: Sun, 6 Apr 2014 16:03:06 -0700 Subject: [PATCH 16/21] Fix SPARK-1420 The maven build error for Spark Catalyst Author: witgo Closes #333 from witgo/SPARK-1420 and squashes the following commits: 902519e [witgo] add dependency scala-reflect to catalyst --- sql/catalyst/pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 0edce55a93338..9d5c6a857bb00 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -44,6 +44,10 @@ + + org.scala-lang + scala-reflect + org.apache.spark spark-core_${scala.binary.version} From e258e5040fa1905a04efcb7b3ca4a6d33e18fa61 Mon Sep 17 00:00:00 2001 From: Egor Pakhomov Date: Sun, 6 Apr 2014 16:41:23 -0700 Subject: [PATCH 17/21] [SPARK-1259] Make RDD locally iterable Author: Egor Pakhomov Closes #156 from epahomov/SPARK-1259 and squashes the following commits: 8ec8f24 [Egor Pakhomov] Make to local iterator shorter 34aa300 [Egor Pakhomov] Fix toLocalIterator docs 08363ef [Egor Pakhomov] SPARK-1259 from toLocallyIterable to toLocalIterator 6a994eb [Egor Pakhomov] SPARK-1259 Make RDD locally iterable 8be3dcf [Egor Pakhomov] SPARK-1259 Make RDD locally iterable 33ecb17 [Egor Pakhomov] SPARK-1259 Make RDD locally iterable --- .../org/apache/spark/api/java/JavaRDDLike.scala | 14 +++++++++++++- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 12 ++++++++++++ .../test/java/org/apache/spark/JavaAPISuite.java | 9 +++++++++ .../test/scala/org/apache/spark/rdd/RDDSuite.scala | 1 + 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index e03b8e78d5f52..6e8ec8e0c7629 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -17,7 +17,8 @@ package org.apache.spark.api.java -import java.util.{Comparator, List => JList} +import java.util.{Comparator, Iterator => JIterator, List => JList} +import 
java.lang.{Iterable => JIterable} import scala.collection.JavaConversions._ import scala.reflect.ClassTag @@ -280,6 +281,17 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { new java.util.ArrayList(arr) } + /** + * Return an iterator that contains all of the elements in this RDD. + * + * The iterator will consume as much memory as the largest partition in this RDD. + */ + def toLocalIterator(): JIterator[T] = { + import scala.collection.JavaConversions._ + rdd.toLocalIterator + } + + /** * Return an array that contains all of the elements in this RDD. * @deprecated As of Spark 1.0.0, toArray() is deprecated, use {@link #collect()} instead diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 08c42c5ee87b6..c43823bd769b7 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -661,6 +661,18 @@ abstract class RDD[T: ClassTag]( Array.concat(results: _*) } + /** + * Return an iterator that contains all of the elements in this RDD. + * + * The iterator will consume as much memory as the largest partition in this RDD. + */ + def toLocalIterator: Iterator[T] = { + def collectPartition(p: Int): Array[T] = { + sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p), allowLocal = false).head + } + (0 until partitions.length).iterator.flatMap(i => collectPartition(i)) + } + /** * Return an array that contains all of the elements in this RDD. */ diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 2372f2d9924a1..762405be2a8f9 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -22,6 +22,7 @@ import scala.Tuple2; +import com.google.common.collect.Lists; import com.google.common.base.Optional; import com.google.common.base.Charsets; import com.google.common.io.Files; @@ -179,6 +180,14 @@ public void call(String s) { Assert.assertEquals(2, foreachCalls); } + @Test + public void toLocalIterator() { + List correct = Arrays.asList(1, 2, 3, 4); + JavaRDD rdd = sc.parallelize(correct); + List result = Lists.newArrayList(rdd.toLocalIterator()); + Assert.assertTrue(correct.equals(result)); + } + @SuppressWarnings("unchecked") @Test public void lookup() { diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index d6b5fdc7984b4..25973348a7837 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -33,6 +33,7 @@ class RDDSuite extends FunSuite with SharedSparkContext { test("basic operations") { val nums = sc.makeRDD(Array(1, 2, 3, 4), 2) assert(nums.collect().toList === List(1, 2, 3, 4)) + assert(nums.toLocalIterator.toList === List(1, 2, 3, 4)) val dups = sc.makeRDD(Array(1, 1, 2, 2, 3, 3, 4, 4), 2) assert(dups.distinct().count() === 4) assert(dups.distinct.count === 4) // Can distinct and count be called without parentheses? From 856c50f59bffbf76ad495eaab837febaf65cf02d Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 6 Apr 2014 17:40:37 -0700 Subject: [PATCH 18/21] SPARK-1387. 
Update build plugins, avoid plugin version warning, centralize versions Another handful of small build changes to organize and standardize a bit, and avoid warnings: - Update Maven plugin versions for good measure - Since plugins need maven 3.0.4 already, require it explicitly (<3.0.4 had some bugs anyway) - Use variables to define versions across dependencies where they should move in lock step - ... and make this consistent between Maven/SBT OK, I also updated the JIRA URL while I was at it here. Author: Sean Owen Closes #291 from srowen/SPARK-1387 and squashes the following commits: 461eca1 [Sean Owen] Couldn't resist also updating JIRA location to new one c2d5cc5 [Sean Owen] Update plugins and Maven version; use variables consistently across Maven/SBT to define dependency versions that should stay in step. --- assembly/pom.xml | 2 +- core/pom.xml | 2 - dev/audit-release/maven_app_core/pom.xml | 2 +- docs/building-with-maven.md | 2 +- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 43 +++++++++++---------- project/SparkBuild.scala | 49 ++++++++++++++---------- streaming/pom.xml | 1 - 10 files changed, 57 insertions(+), 50 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index b5e752c6cd1f6..255107a2c47cb 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -208,7 +208,7 @@ org.codehaus.mojo buildnumber-maven-plugin - 1.1 + 1.2 validate diff --git a/core/pom.xml b/core/pom.xml index 66f9fc4961b03..1f808380817c9 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -117,12 +117,10 @@ com.twitter chill_${scala.binary.version} - 0.3.1 com.twitter chill-java - 0.3.1 commons-net diff --git a/dev/audit-release/maven_app_core/pom.xml b/dev/audit-release/maven_app_core/pom.xml index 0b837c01751fe..76a381f8e17e0 100644 --- a/dev/audit-release/maven_app_core/pom.xml +++ b/dev/audit-release/maven_app_core/pom.xml @@ -49,7 +49,7 @@ maven-compiler-plugin - 2.3.2 + 3.1 diff --git a/docs/building-with-maven.md b/docs/building-with-maven.md index 730a6e7932564..9cebaf12283fc 100644 --- a/docs/building-with-maven.md +++ b/docs/building-with-maven.md @@ -6,7 +6,7 @@ title: Building Spark with Maven * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven Requires Maven 3 (the build process is tested with Maven 3.0.4) and Java 1.6 or newer. +Building Spark using Maven requires Maven 3.0.4 or newer and Java 1.6 or newer. 
## Setting up Maven's Memory Usage ## diff --git a/examples/pom.xml b/examples/pom.xml index a5569ff5e71f3..0b6212b5d1549 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -110,7 +110,7 @@ org.apache.hbase hbase - 0.94.6 + ${hbase.version} asm diff --git a/graphx/pom.xml b/graphx/pom.xml index 5a5022916d234..b4c67ddcd8ca9 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -54,7 +54,7 @@ org.jblas jblas - 1.2.3 + ${jblas.version} org.eclipse.jetty diff --git a/mllib/pom.xml b/mllib/pom.xml index fec1cc94b2642..e7ce00efc4af6 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -58,7 +58,7 @@ org.jblas jblas - 1.2.3 + ${jblas.version} org.scalanlp diff --git a/pom.xml b/pom.xml index 01341d21b7f23..1426e0e00214c 100644 --- a/pom.xml +++ b/pom.xml @@ -54,11 +54,11 @@ JIRA - https://spark-project.atlassian.net/browse/SPARK + https://issues.apache.org/jira/browse/SPARK - 3.0.0 + 3.0.4 @@ -123,6 +123,10 @@ 0.94.6 0.12.0 1.3.2 + 1.2.3 + 8.1.14.v20131031 + 0.3.1 + 3.0.0 64m 512m @@ -192,22 +196,22 @@ org.eclipse.jetty jetty-util - 8.1.14.v20131031 + ${jetty.version} org.eclipse.jetty jetty-security - 8.1.14.v20131031 + ${jetty.version} org.eclipse.jetty jetty-plus - 8.1.14.v20131031 + ${jetty.version} org.eclipse.jetty jetty-server - 8.1.14.v20131031 + ${jetty.version} com.google.guava @@ -273,7 +277,7 @@ com.twitter chill_${scala.binary.version} - 0.3.1 + ${chill.version} org.ow2.asm @@ -288,7 +292,7 @@ com.twitter chill-java - 0.3.1 + ${chill.version} org.ow2.asm @@ -392,27 +396,27 @@ com.codahale.metrics metrics-core - 3.0.0 + ${codahale.metrics.version} com.codahale.metrics metrics-jvm - 3.0.0 + ${codahale.metrics.version} com.codahale.metrics metrics-json - 3.0.0 + ${codahale.metrics.version} com.codahale.metrics metrics-ganglia - 3.0.0 + ${codahale.metrics.version} com.codahale.metrics metrics-graphite - 3.0.0 + ${codahale.metrics.version} org.scala-lang @@ -585,7 +589,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 1.1.1 + 1.3.1 enforce-versions @@ -595,7 +599,7 @@ - 3.0.0 + 3.0.4 ${java.version} @@ -608,12 +612,12 @@ org.codehaus.mojo build-helper-maven-plugin - 1.7 + 1.8 net.alchim31.maven scala-maven-plugin - 3.1.5 + 3.1.6 scala-compile-first @@ -674,7 +678,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.12.4 + 2.17 true @@ -713,7 +717,7 @@ org.apache.maven.plugins maven-shade-plugin - 2.0 + 2.2 org.apache.maven.plugins @@ -810,7 +814,6 @@ org.apache.maven.plugins maven-jar-plugin - 2.4 diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 843a874fbfdb0..3489b43d43f0d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -248,10 +248,10 @@ object SparkBuild extends Build { libraryDependencies ++= Seq( "io.netty" % "netty-all" % "4.0.17.Final", - "org.eclipse.jetty" % "jetty-server" % "8.1.14.v20131031", - "org.eclipse.jetty" % "jetty-util" % "8.1.14.v20131031", - "org.eclipse.jetty" % "jetty-plus" % "8.1.14.v20131031", - "org.eclipse.jetty" % "jetty-security" % "8.1.14.v20131031", + "org.eclipse.jetty" % "jetty-server" % jettyVersion, + "org.eclipse.jetty" % "jetty-util" % jettyVersion, + "org.eclipse.jetty" % "jetty-plus" % jettyVersion, + "org.eclipse.jetty" % "jetty-security" % jettyVersion, /** Workaround for SPARK-959. Dependency used by org.eclipse.jetty. Fixed in ivy 2.3.0. 
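 * (Older ivy releases mis-handle the "orbit" packaging of this artifact, so its type is pinned to a
 * plain "jar" via the explicit Artifact("javax.servlet", "jar", "jar") below.)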
*/ "org.eclipse.jetty.orbit" % "javax.servlet" % "3.0.0.v201112011016" artifacts Artifact("javax.servlet", "jar", "jar"), "org.scalatest" %% "scalatest" % "1.9.1" % "test", @@ -276,6 +276,13 @@ object SparkBuild extends Build { publishLocalBoth <<= Seq(publishLocal in MavenCompile, publishLocal).dependOn ) ++ net.virtualvoid.sbt.graph.Plugin.graphSettings ++ ScalaStyleSettings + val akkaVersion = "2.2.3-shaded-protobuf" + val chillVersion = "0.3.1" + val codahaleMetricsVersion = "3.0.0" + val jblasVersion = "1.2.3" + val jettyVersion = "8.1.14.v20131031" + val hiveVersion = "0.12.0" + val parquetVersion = "1.3.2" val slf4jVersion = "1.7.5" val excludeNetty = ExclusionRule(organization = "org.jboss.netty") @@ -309,9 +316,9 @@ object SparkBuild extends Build { "commons-daemon" % "commons-daemon" % "1.0.10", // workaround for bug HADOOP-9407 "com.ning" % "compress-lzf" % "1.0.0", "org.xerial.snappy" % "snappy-java" % "1.0.5", - "org.spark-project.akka" %% "akka-remote" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty), - "org.spark-project.akka" %% "akka-slf4j" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty), - "org.spark-project.akka" %% "akka-testkit" % "2.2.3-shaded-protobuf" % "test", + "org.spark-project.akka" %% "akka-remote" % akkaVersion excludeAll(excludeNetty), + "org.spark-project.akka" %% "akka-slf4j" % akkaVersion excludeAll(excludeNetty), + "org.spark-project.akka" %% "akka-testkit" % akkaVersion % "test", "org.json4s" %% "json4s-jackson" % "3.2.6" excludeAll(excludeScalap), "it.unimi.dsi" % "fastutil" % "6.4.4", "colt" % "colt" % "1.2.0", @@ -321,12 +328,12 @@ object SparkBuild extends Build { "org.apache.derby" % "derby" % "10.4.2.0" % "test", "org.apache.hadoop" % hadoopClient % hadoopVersion excludeAll(excludeNetty, excludeAsm, excludeCommonsLogging, excludeSLF4J, excludeOldAsm), "org.apache.curator" % "curator-recipes" % "2.4.0" excludeAll(excludeNetty), - "com.codahale.metrics" % "metrics-core" % "3.0.0", - "com.codahale.metrics" % "metrics-jvm" % "3.0.0", - "com.codahale.metrics" % "metrics-json" % "3.0.0", - "com.codahale.metrics" % "metrics-graphite" % "3.0.0", - "com.twitter" %% "chill" % "0.3.1" excludeAll(excludeAsm), - "com.twitter" % "chill-java" % "0.3.1" excludeAll(excludeAsm), + "com.codahale.metrics" % "metrics-core" % codahaleMetricsVersion, + "com.codahale.metrics" % "metrics-jvm" % codahaleMetricsVersion, + "com.codahale.metrics" % "metrics-json" % codahaleMetricsVersion, + "com.codahale.metrics" % "metrics-graphite" % codahaleMetricsVersion, + "com.twitter" %% "chill" % chillVersion excludeAll(excludeAsm), + "com.twitter" % "chill-java" % chillVersion excludeAll(excludeAsm), "org.tachyonproject" % "tachyon" % "0.4.1-thrift" excludeAll(excludeHadoop, excludeCurator, excludeEclipseJetty, excludePowermock), "com.clearspring.analytics" % "stream" % "2.5.1" ), @@ -370,7 +377,7 @@ object SparkBuild extends Build { name := "spark-graphx", previousArtifact := sparkPreviousArtifact("spark-graphx"), libraryDependencies ++= Seq( - "org.jblas" % "jblas" % "1.2.3" + "org.jblas" % "jblas" % jblasVersion ) ) @@ -383,7 +390,7 @@ object SparkBuild extends Build { name := "spark-mllib", previousArtifact := sparkPreviousArtifact("spark-mllib"), libraryDependencies ++= Seq( - "org.jblas" % "jblas" % "1.2.3", + "org.jblas" % "jblas" % jblasVersion, "org.scalanlp" %% "breeze" % "0.7" ) ) @@ -403,8 +410,8 @@ object SparkBuild extends Build { def sqlCoreSettings = sharedSettings ++ Seq( name := "spark-sql", libraryDependencies ++= Seq( - "com.twitter" % "parquet-column" % 
"1.3.2", - "com.twitter" % "parquet-hadoop" % "1.3.2" + "com.twitter" % "parquet-column" % parquetVersion, + "com.twitter" % "parquet-hadoop" % parquetVersion ) ) @@ -416,9 +423,9 @@ object SparkBuild extends Build { jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" }, javaOptions += "-XX:MaxPermSize=1g", libraryDependencies ++= Seq( - "org.apache.hive" % "hive-metastore" % "0.12.0", - "org.apache.hive" % "hive-exec" % "0.12.0", - "org.apache.hive" % "hive-serde" % "0.12.0" + "org.apache.hive" % "hive-metastore" % hiveVersion, + "org.apache.hive" % "hive-exec" % hiveVersion, + "org.apache.hive" % "hive-serde" % hiveVersion ), // Multiple queries rely on the TestHive singleton. See comments there for more details. parallelExecution in Test := false, @@ -549,7 +556,7 @@ object SparkBuild extends Build { name := "spark-streaming-zeromq", previousArtifact := sparkPreviousArtifact("spark-streaming-zeromq"), libraryDependencies ++= Seq( - "org.spark-project.akka" %% "akka-zeromq" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty) + "org.spark-project.akka" %% "akka-zeromq" % akkaVersion excludeAll(excludeNetty) ) ) diff --git a/streaming/pom.xml b/streaming/pom.xml index 1953cc6883378..93b1c5a37aff9 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -96,7 +96,6 @@ org.apache.maven.plugins maven-jar-plugin - 2.2 From 7ce52c4a7a07b0db5e7c1312b1920efb1165ce6a Mon Sep 17 00:00:00 2001 From: Aaron Davidson Date: Sun, 6 Apr 2014 17:43:44 -0700 Subject: [PATCH 19/21] SPARK-1349: spark-shell gets its own command history Currently, spark-shell shares its command history with scala repl. This fix is simply a modification of the default FileBackedHistory file setting: https://github.com/scala/scala/blob/master/src/repl/scala/tools/nsc/interpreter/session/FileBackedHistory.scala#L77 Author: Aaron Davidson Closes #267 from aarondav/repl and squashes the following commits: f9c62d2 [Aaron Davidson] SPARK-1349: spark-shell gets its own command history separate from scala repl --- .../org/apache/spark/repl/SparkJLineReader.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkJLineReader.scala b/repl/src/main/scala/org/apache/spark/repl/SparkJLineReader.scala index 946e71039088d..0db26c3407dff 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkJLineReader.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkJLineReader.scala @@ -7,8 +7,10 @@ package org.apache.spark.repl +import scala.reflect.io.{Path, File} import scala.tools.nsc._ import scala.tools.nsc.interpreter._ +import scala.tools.nsc.interpreter.session.JLineHistory.JLineFileHistory import scala.tools.jline.console.ConsoleReader import scala.tools.jline.console.completer._ @@ -25,7 +27,7 @@ class SparkJLineReader(_completion: => Completion) extends InteractiveReader { val consoleReader = new JLineConsoleReader() lazy val completion = _completion - lazy val history: JLineHistory = JLineHistory() + lazy val history: JLineHistory = new SparkJLineHistory private def term = consoleReader.getTerminal() def reset() = term.reset() @@ -78,3 +80,11 @@ class SparkJLineReader(_completion: => Completion) extends InteractiveReader { def readOneLine(prompt: String) = consoleReader readLine prompt def readOneKey(prompt: String) = consoleReader readOneKey prompt } + +/** Changes the default history file to not collide with the scala repl's. 
*/ +class SparkJLineHistory extends JLineFileHistory { + import Properties.userHome + + def defaultFileName = ".spark_history" + override protected lazy val historyFile = File(Path(userHome) / defaultFileName) +} From 4106558435889261243d186f5f0b51c5f9e98d56 Mon Sep 17 00:00:00 2001 From: Aaron Davidson Date: Sun, 6 Apr 2014 17:48:41 -0700 Subject: [PATCH 20/21] SPARK-1314: Use SPARK_HIVE to determine if we include Hive in packaging Previously, we based our decision regarding including datanucleus jars based on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" is run. This means that a typical and previously supported pathway would start using hive jars. This patch has the following features/bug fixes: - Use of SPARK_HIVE (default false) to determine if we should include Hive in the assembly jar. - Analagous feature in Maven with -Phive (previously, there was no support for adding Hive to any of our jars produced by Maven) - assemble-deps fixed since we no longer use a different ASSEMBLY_DIR - avoid adding log message in compute-classpath.sh to the classpath :) Still TODO before mergeable: - We need to download the datanucleus jars outside of sbt. Perhaps we can have spark-class download them if SPARK_HIVE is set similar to how sbt downloads itself. - Spark SQL documentation updates. Author: Aaron Davidson Closes #237 from aarondav/master and squashes the following commits: 5dc4329 [Aaron Davidson] Typo fixes dd4f298 [Aaron Davidson] Doc update dd1a365 [Aaron Davidson] Eliminate need for SPARK_HIVE at runtime by d/ling datanucleus from Maven a9269b5 [Aaron Davidson] [WIP] Use SPARK_HIVE to determine if we include Hive in packaging --- assembly/pom.xml | 10 ++++++++ bin/compute-classpath.sh | 35 +++++++++++++++------------- bin/spark-class | 2 -- dev/create-release/create-release.sh | 4 ++-- docs/sql-programming-guide.md | 4 ++-- pom.xml | 7 +++++- project/SparkBuild.scala | 25 +++++++++++++------- sql/hive/pom.xml | 28 ++++++++++++++++++++++ 8 files changed, 83 insertions(+), 32 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 255107a2c47cb..923bf47f7076a 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -163,6 +163,16 @@ + + hive + + + org.apache.spark + spark-hive_${scala.binary.version} + ${project.version} + + + spark-ganglia-lgpl diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index bef42df71ce01..be37102dc069a 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -30,21 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)" # Build up classpath CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf" -# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break -# existing Spark applications, it is not included in the standard spark assembly. Instead, we only -# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly" -# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in -# the future. -if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then - - # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost. 
- DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}") - CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS - - ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/" -else - ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/" -fi +ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION" # First check if we have a dependencies jar. If so, include binary classes with the deps jar if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then @@ -59,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes" - DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar` + DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar` CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR" else # Else use spark-assembly jar from either RELEASE or assembly directory @@ -71,6 +57,23 @@ else CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR" fi +# When Hive support is needed, Datanucleus jars must be included on the classpath. +# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. +# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is +# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark +# assembly is built for Hive, before actually populating the CLASSPATH with the jars. +# Note that this check order is faster (by up to half a second) in the case where Hive is not used. +num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l) +if [ $num_datanucleus_jars -gt 0 ]; then + AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR} + num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l) + if [ $num_hive_files -gt 0 ]; then + echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2 + DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :) + CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS + fi +fi + # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1 if [[ $SPARK_TESTING == 1 ]]; then CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes" diff --git a/bin/spark-class b/bin/spark-class index 0dcf0e156cb52..76fde3e448891 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then fi exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" - - diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 995106f111443..bf1c5d7953bd2 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -49,14 +49,14 @@ mvn -DskipTests \ -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \ -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \ -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ - -Pyarn -Pspark-ganglia-lgpl \ + -Pyarn -Phive -Pspark-ganglia-lgpl\ -Dtag=$GIT_TAG -DautoVersionSubmodules=true \ --batch-mode release:prepare mvn -DskipTests \ -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \ -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ - -Pyarn -Pspark-ganglia-lgpl\ + -Pyarn -Phive -Pspark-ganglia-lgpl\ release:perform rm -rf spark diff --git a/docs/sql-programming-guide.md 
b/docs/sql-programming-guide.md index f849716f7a48f..a59393e1424de 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -264,8 +264,8 @@ evaluated by the SQL execution engine. A full list of the functions supported c Spark SQL also supports reading and writing data stored in [Apache Hive](http://hive.apache.org/). However, since Hive has a large number of dependencies, it is not included in the default Spark assembly. -In order to use Hive you must first run '`SPARK_HIVE=true sbt/sbt assembly/assembly`'. This command builds a new assembly -jar that includes Hive. Note that this Hive assembly jar must also be present +In order to use Hive you must first run '`SPARK_HIVE=true sbt/sbt assembly/assembly`' (or use `-Phive` for maven). +This command builds a new assembly jar that includes Hive. Note that this Hive assembly jar must also be present on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries (SerDes) in order to acccess data stored in Hive. diff --git a/pom.xml b/pom.xml index 1426e0e00214c..c03bb35c99442 100644 --- a/pom.xml +++ b/pom.xml @@ -377,7 +377,6 @@ org.apache.derby derby 10.4.2.0 - test net.liftweb @@ -580,6 +579,12 @@ + + + org.codehaus.jackson + jackson-mapper-asl + 1.8.8 + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 3489b43d43f0d..d1e4b8b964b88 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -43,6 +43,8 @@ object SparkBuild extends Build { val DEFAULT_YARN = false + val DEFAULT_HIVE = false + // HBase version; set as appropriate. val HBASE_VERSION = "0.94.6" @@ -67,15 +69,17 @@ object SparkBuild extends Build { lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst) - // Since hive is its own assembly, it depends on all of the modules. 
- lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl) + lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql) + + lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq() + lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq() lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core) lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core) lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings) - .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*) + .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*) lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects") @@ -101,6 +105,11 @@ object SparkBuild extends Build { lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client" val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq() + lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match { + case None => DEFAULT_HIVE + case Some(v) => v.toBoolean + } + // Include Ganglia integration if the user has enabled Ganglia // This is isolated from the normal build due to LGPL-licensed code in the library lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined @@ -141,13 +150,13 @@ object SparkBuild extends Build { lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt) lazy val examples = Project("examples", file("examples"), settings = examplesSettings) - .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*) + .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*) // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects - lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef + lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef lazy val allProjects = packageProjects ++ allExternalRefs ++ - Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests + Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq( organization := "org.apache.spark", @@ -417,10 +426,8 @@ object SparkBuild extends Build { // Since we don't include hive in the main assembly this project also acts as an alternative // assembly jar. 
- def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq( + def hiveSettings = sharedSettings ++ Seq( name := "spark-hive", - jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" }, - jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" }, javaOptions += "-XX:MaxPermSize=1g", libraryDependencies ++= Seq( "org.apache.hive" % "hive-metastore" % hiveVersion, diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 63f592cb4b441..a662da76ce25a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -63,6 +63,10 @@ hive-exec ${hive.version} + + org.codehaus.jackson + jackson-mapper-asl + org.apache.hive hive-serde @@ -87,6 +91,30 @@ org.scalatest scalatest-maven-plugin + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.4 + + + copy-dependencies + package + + copy-dependencies + + + + ${basedir}/../../lib_managed/jars + false + false + true + org.datanucleus + + + + From 1440154c27ca48b5a75103eccc9057286d3f6ca8 Mon Sep 17 00:00:00 2001 From: Evan Chan Date: Sun, 6 Apr 2014 19:17:33 -0700 Subject: [PATCH 21/21] SPARK-1154: Clean up app folders in worker nodes This is a fix for [SPARK-1154](https://issues.apache.org/jira/browse/SPARK-1154). The issue is that worker nodes fill up with a huge number of app-* folders after some time. This change adds a periodic cleanup task which asynchronously deletes app directories older than a configurable TTL. Two new configuration parameters have been introduced: spark.worker.cleanup_interval spark.worker.app_data_ttl This change does not include moving the downloads of application jars to a location outside of the work directory. We will address that if we have time, but that potentially involves caching so it will come either as part of this PR or a separate PR. Author: Evan Chan Author: Kelvin Chu Closes #288 from velvia/SPARK-1154-cleanup-app-folders and squashes the following commits: 0689995 [Evan Chan] CR from @aarondav - move config, clarify for standalone mode 9f10d96 [Evan Chan] CR from @pwendell - rename configs and add cleanup.enabled f2f6027 [Evan Chan] CR from @andrewor14 553d8c2 [Kelvin Chu] change the variable name to currentTimeMillis since it actually tracks in seconds 8dc9cb5 [Kelvin Chu] Fixed a bug in Utils.findOldFiles() after merge. cb52f2b [Kelvin Chu] Change the name of findOldestFiles() to findOldFiles() 72f7d2d [Kelvin Chu] Fix a bug of Utils.findOldestFiles(). file.lastModified is returned in milliseconds. 
ad99955 [Kelvin Chu] Add unit test for Utils.findOldestFiles() dc1a311 [Evan Chan] Don't recompute current time with every new file e3c408e [Evan Chan] Document the two new settings b92752b [Evan Chan] SPARK-1154: Add a periodic task to clean up app directories --- .../apache/spark/deploy/DeployMessage.scala | 4 +++ .../apache/spark/deploy/worker/Worker.scala | 23 +++++++++++++++- .../scala/org/apache/spark/util/Utils.scala | 19 ++++++++++++-- .../org/apache/spark/util/UtilsSuite.scala | 15 ++++++++++- docs/configuration.md | 26 +++++++++++++++++++ 5 files changed, 83 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index 83ce14a0a806a..a7368f9f3dfbe 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -86,6 +86,10 @@ private[deploy] object DeployMessages { case class KillDriver(driverId: String) extends DeployMessage + // Worker internal + + case object WorkDirCleanup // Sent to Worker actor periodically for cleaning up app folders + // AppClient to Master case class RegisterApplication(appDescription: ApplicationDescription) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 8a71ddda4cb5e..bf5a8d09dd2df 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -64,6 +64,12 @@ private[spark] class Worker( val REGISTRATION_TIMEOUT = 20.seconds val REGISTRATION_RETRIES = 3 + val CLEANUP_ENABLED = conf.getBoolean("spark.worker.cleanup.enabled", true) + // How often worker will clean up old app folders + val CLEANUP_INTERVAL_MILLIS = conf.getLong("spark.worker.cleanup.interval", 60 * 30) * 1000 + // TTL for app folders/data; after TTL expires it will be cleaned up + val APP_DATA_RETENTION_SECS = conf.getLong("spark.worker.cleanup.appDataTtl", 7 * 24 * 3600) + // Index into masterUrls that we're currently trying to register with. var masterIndex = 0 @@ -179,12 +185,28 @@ private[spark] class Worker( registered = true changeMaster(masterUrl, masterWebUiUrl) context.system.scheduler.schedule(0 millis, HEARTBEAT_MILLIS millis, self, SendHeartbeat) + if (CLEANUP_ENABLED) { + context.system.scheduler.schedule(CLEANUP_INTERVAL_MILLIS millis, + CLEANUP_INTERVAL_MILLIS millis, self, WorkDirCleanup) + } case SendHeartbeat => masterLock.synchronized { if (connected) { master ! 
Heartbeat(workerId) } } + case WorkDirCleanup => + // Spin up a separate thread (in a future) to do the dir cleanup; don't tie up worker actor + val cleanupFuture = concurrent.future { + logInfo("Cleaning up oldest application directories in " + workDir + " ...") + Utils.findOldFiles(workDir, APP_DATA_RETENTION_SECS) + .foreach(Utils.deleteRecursively) + } + cleanupFuture onFailure { + case e: Throwable => + logError("App dir cleanup failed: " + e.getMessage, e) + } + case MasterChanged(masterUrl, masterWebUiUrl) => logInfo("Master has changed, new master is at " + masterUrl) changeMaster(masterUrl, masterWebUiUrl) @@ -331,7 +353,6 @@ private[spark] class Worker( } private[spark] object Worker { - def main(argStrings: Array[String]) { val args = new WorkerArguments(argStrings) val (actorSystem, _) = startSystemAndActor(args.host, args.port, args.webUiPort, args.cores, diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index d3c39dee330b2..4435b21a7505e 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -597,9 +597,24 @@ private[spark] object Utils extends Logging { } if (fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile())) { - return false; + return false } else { - return true; + return true + } + } + + /** + * Finds all the files in a directory whose last modified time is older than cutoff seconds. + * @param dir must be the path to a directory, or IllegalArgumentException is thrown + * @param cutoff measured in seconds. Files older than this are returned. + */ + def findOldFiles(dir: File, cutoff: Long): Seq[File] = { + val currentTimeMillis = System.currentTimeMillis + if (dir.isDirectory) { + val files = listFilesSafely(dir) + files.filter { file => file.lastModified < (currentTimeMillis - cutoff * 1000) } + } else { + throw new IllegalArgumentException(dir + " is not a directory!") } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 616214fb5e3a6..eb7fb6318262b 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.util import scala.util.Random -import java.io.{ByteArrayOutputStream, ByteArrayInputStream, FileOutputStream} +import java.io.{File, ByteArrayOutputStream, ByteArrayInputStream, FileOutputStream} import java.nio.{ByteBuffer, ByteOrder} import com.google.common.base.Charsets @@ -154,5 +154,18 @@ class UtilsSuite extends FunSuite { val iterator = Iterator.range(0, 5) assert(Utils.getIteratorSize(iterator) === 5L) } + + test("findOldFiles") { + // create some temporary directories and files + val parent: File = Utils.createTempDir() + val child1: File = Utils.createTempDir(parent.getCanonicalPath) // The parent directory has two child directories + val child2: File = Utils.createTempDir(parent.getCanonicalPath) + // set the last modified time of child1 to 10 secs old + child1.setLastModified(System.currentTimeMillis() - (1000 * 10)) + + val result = Utils.findOldFiles(parent, 5) // find files older than 5 secs + assert(result.size.equals(1)) + assert(result(0).getCanonicalPath.equals(child1.getCanonicalPath)) + } } diff --git a/docs/configuration.md b/docs/configuration.md index b6005acac8b93..57bda20edcdf1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -348,6 
+348,32 @@ Apart from these, the following properties are also available, and may be useful
   receives no heartbeats.

 Storage Level         Meaning
 MEMORY_AND_DISK_SER
-  Similar to MEMORY_ONLY_SER, but spill partitions that don't fit in memory to disk instead of recomputing them
-  on the fly each time they're needed.
+  Similar to MEMORY_ONLY_SER, but spill partitions that don't fit in memory to disk instead of
+  recomputing them on the fly each time they're needed.
+OFF_HEAP
+  Store RDD in a serialized format in Tachyon. This is generally more space-efficient than
+  deserialized objects, especially when using a fast serializer, but more CPU-intensive to read.
+  This also significantly reduces the overheads of GC.
 DISK_ONLY
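The storage levels above are selected per RDD from application code. A minimal sketch, e.g. from spark-shell where sc is predefined (the input path is illustrative, and OFF_HEAP additionally assumes a reachable Tachyon store):

    import org.apache.spark.storage.StorageLevel

    val lines = sc.textFile("hdfs:///data/logs")
    // Keep partitions as serialized bytes, spilling to disk instead of recomputing them.
    lines.persist(StorageLevel.MEMORY_AND_DISK_SER)
    // Or keep the serialized partitions off-heap in Tachyon:
    // lines.persist(StorageLevel.OFF_HEAP)
    println(lines.count())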
+spark.worker.cleanup.enabled      true
+  Enable periodic cleanup of worker / application directories. Note that this only affects standalone
+  mode, as YARN works differently.
+spark.worker.cleanup.interval     1800 (30 minutes)
+  Controls the interval, in seconds, at which the worker cleans up old application work dirs
+  on the local machine.
+spark.worker.cleanup.appDataTtl   7 * 24 * 3600 (7 days)
+  The number of seconds to retain application work directories on each worker. This is a Time To Live
+  and should depend on the amount of available disk space you have. Application logs and jars are
+  downloaded to each application work dir. Over time, the work dirs can quickly fill up disk space,
+  especially if you run jobs very frequently.
 spark.akka.frameSize              10
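The filtering that the new WorkDirCleanup message triggers can be reproduced in a few lines of standalone Scala. This is only a sketch that mirrors Utils.findOldFiles from this patch (it uses File.listFiles instead of listFilesSafely, and the directory and TTL below are illustrative stand-ins, not values read from a real worker):

    import java.io.File

    object WorkDirCleanupSketch {
      // Return the entries of `dir` whose last-modified time is older than `cutoffSeconds`.
      def findOldFiles(dir: File, cutoffSeconds: Long): Seq[File] = {
        require(dir.isDirectory, dir + " is not a directory!")
        val now = System.currentTimeMillis
        dir.listFiles.toSeq.filter(_.lastModified < now - cutoffSeconds * 1000)
      }

      def main(args: Array[String]): Unit = {
        val workDir = new File(System.getProperty("java.io.tmpdir"))  // stand-in for the worker's work dir
        val appDataTtl = 7 * 24 * 3600L                                // the spark.worker.cleanup.appDataTtl default
        // The worker hands each of these to Utils.deleteRecursively; here we just print them.
        findOldFiles(workDir, appDataTtl).foreach(f => println(f.getAbsolutePath))
      }
    }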