Merge branch 'master' of https://github.com/apache/spark into SPARK-1413
witgo committed Apr 5, 2014
2 parents 76670c1 + 7c18428 commit 5ad52bd
Showing 71 changed files with 1,716 additions and 301 deletions.
1 change: 1 addition & 0 deletions .rat-excludes
@@ -39,3 +39,4 @@ work
.*\.q
golden
test.out/*
.*iml
3 changes: 3 additions & 0 deletions bin/load-spark-env.sh
@@ -30,6 +30,9 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}

if [ -f "${use_conf_dir}/spark-env.sh" ]; then
# Promote all variable declarations to environment (exported) variables
set -a
. "${use_conf_dir}/spark-env.sh"
set +a
fi
fi
4 changes: 2 additions & 2 deletions bin/spark-shell
@@ -127,7 +127,7 @@ function set_spark_log_conf(){

function set_spark_master(){
if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
MASTER="$1"
export MASTER="$1"
else
out_error "wrong format for $2"
fi
@@ -145,7 +145,7 @@ function resolve_spark_master(){
fi

if [ -z "$MASTER" ]; then
MASTER="$DEFAULT_MASTER"
export MASTER="$DEFAULT_MASTER"
fi

}
47 changes: 47 additions & 0 deletions core/pom.xml
@@ -200,6 +200,53 @@
<artifactId>derby</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.tachyonproject</groupId>
<artifactId>tachyon</artifactId>
<version>0.4.1-thrift</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-jsp</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</exclusion>
<exclusion>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</exclusion>
<exclusion>
<groupId>org.powermock</groupId>
<artifactId>powermock-module-junit4</artifactId>
</exclusion>
<exclusion>
<groupId>org.powermock</groupId>
<artifactId>powermock-api-mockito</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-test</artifactId>
</exclusion>
</exclusions>
</dependency>
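For anyone tracking the sbt build as well, a rough equivalent of this dependency is sketched below; the coordinates and exclusions simply mirror the Maven block above, which remains the authoritative definition.

{{{
// Sketch only: sbt build-definition equivalent of the Maven dependency above.
libraryDependencies += ("org.tachyonproject" % "tachyon" % "0.4.1-thrift")
  .exclude("org.apache.hadoop", "hadoop-client")
  .exclude("org.apache.curator", "curator-recipes")
  .exclude("org.apache.curator", "curator-test")
  .exclude("org.eclipse.jetty", "jetty-jsp")
  .exclude("org.eclipse.jetty", "jetty-webapp")
  .exclude("org.eclipse.jetty", "jetty-server")
  .exclude("org.eclipse.jetty", "jetty-servlet")
  .exclude("junit", "junit")
  .exclude("org.powermock", "powermock-module-junit4")
  .exclude("org.powermock", "powermock-api-mockito")
}}}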
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
46 changes: 33 additions & 13 deletions core/src/main/java/org/apache/spark/api/java/StorageLevels.java
@@ -23,17 +23,18 @@
* Expose some commonly useful storage level constants.
*/
public class StorageLevels {
public static final StorageLevel NONE = create(false, false, false, 1);
public static final StorageLevel DISK_ONLY = create(true, false, false, 1);
public static final StorageLevel DISK_ONLY_2 = create(true, false, false, 2);
public static final StorageLevel MEMORY_ONLY = create(false, true, true, 1);
public static final StorageLevel MEMORY_ONLY_2 = create(false, true, true, 2);
public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, 1);
public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, 2);
public static final StorageLevel MEMORY_AND_DISK = create(true, true, true, 1);
public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, true, 2);
public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, 1);
public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, 2);
public static final StorageLevel NONE = create(false, false, false, false, 1);
public static final StorageLevel DISK_ONLY = create(true, false, false, false, 1);
public static final StorageLevel DISK_ONLY_2 = create(true, false, false, false, 2);
public static final StorageLevel MEMORY_ONLY = create(false, true, false, true, 1);
public static final StorageLevel MEMORY_ONLY_2 = create(false, true, false, true, 2);
public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, false, 1);
public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, false, 2);
public static final StorageLevel MEMORY_AND_DISK = create(true, true, false, true, 1);
public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, false, true, 2);
public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, false, 1);
public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, false, 2);
public static final StorageLevel OFF_HEAP = create(false, false, true, false, 1);

/**
* Create a new StorageLevel object.
@@ -42,7 +43,26 @@ public class StorageLevels {
* @param deserialized saved as deserialized objects, if true
* @param replication replication factor
*/
public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized, int replication) {
return StorageLevel.apply(useDisk, useMemory, deserialized, replication);
@Deprecated
public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized,
int replication) {
return StorageLevel.apply(useDisk, useMemory, false, deserialized, replication);
}

/**
* Create a new StorageLevel object.
* @param useDisk saved to disk, if true
* @param useMemory saved to memory, if true
* @param useOffHeap saved to Tachyon, if true
* @param deserialized saved as deserialized objects, if true
* @param replication replication factor
*/
public static StorageLevel create(
boolean useDisk,
boolean useMemory,
boolean useOffHeap,
boolean deserialized,
int replication) {
return StorageLevel.apply(useDisk, useMemory, useOffHeap, deserialized, replication);
}
}
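A brief sketch of how the new level and the five-argument factory might be used from Scala; it assumes a live SparkContext named `sc` and a reachable Tachyon store, neither of which appears in this diff.

{{{
import org.apache.spark.storage.StorageLevel

// Persist an RDD off-heap (in Tachyon) using the new level; assumes `sc` is an
// existing SparkContext and spark.tachyonStore.url points at a running Tachyon master.
val lines = sc.textFile("hdfs://a-hdfs-path/part-00000")
lines.persist(StorageLevel.OFF_HEAP)
lines.count()

// The five-argument factory mirrors the new Java signature:
// (useDisk, useMemory, useOffHeap, deserialized, replication)
val memAndDiskTwice = StorageLevel(true, true, false, true, 2) // same as MEMORY_AND_DISK_2
}}}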
44 changes: 41 additions & 3 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -19,14 +19,13 @@ package org.apache.spark

import java.io._
import java.net.URI
import java.util.{Properties, UUID}
import java.util.concurrent.atomic.AtomicInteger

import java.util.{Properties, UUID}
import java.util.UUID.randomUUID
import scala.collection.{Map, Set}
import scala.collection.generic.Growable
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.reflect.{ClassTag, classTag}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
@@ -37,6 +36,7 @@ import org.apache.mesos.MesosNativeLibrary

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
import org.apache.spark.input.WholeTextFileInputFormat
import org.apache.spark.partial.{ApproximateEvaluator, PartialResult}
import org.apache.spark.rdd._
import org.apache.spark.scheduler._
@@ -129,6 +129,11 @@
val master = conf.get("spark.master")
val appName = conf.get("spark.app.name")

// Generate a random name for a temp folder in Tachyon
// The random UUID suffix keeps folder names from concurrent applications from colliding
val tachyonFolderName = "spark-" + randomUUID.toString()
conf.set("spark.tachyonStore.folderName", tachyonFolderName)

val isLocal = (master == "local" || master.startsWith("local["))

if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
@@ -371,6 +376,39 @@
minSplits).map(pair => pair._2.toString)
}

/**
* Read a directory of text files from HDFS, a local file system (available on all nodes), or any
* Hadoop-supported file system URI. Each file is read as a single record and returned in a
* key-value pair, where the key is the path of each file and the value is the content of each file.
*
* <p> For example, if you have the following files:
* {{{
* hdfs://a-hdfs-path/part-00000
* hdfs://a-hdfs-path/part-00001
* ...
* hdfs://a-hdfs-path/part-nnnnn
* }}}
*
* Do `val rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")`,
*
* <p> then `rdd` contains
* {{{
* (a-hdfs-path/part-00000, its content)
* (a-hdfs-path/part-00001, its content)
* ...
* (a-hdfs-path/part-nnnnn, its content)
* }}}
*
* @note Small files are preferred, as each file will be loaded fully in memory.
*/
def wholeTextFiles(path: String): RDD[(String, String)] = {
newAPIHadoopFile(
path,
classOf[WholeTextFileInputFormat],
classOf[String],
classOf[String])
}
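To make the contract above concrete, a small usage sketch follows; the path reuses the one from the scaladoc, and the per-file line count is purely illustrative.

{{{
// Illustrative only: each file under the directory becomes one (path, content) record.
// Assumes `sc` is an existing SparkContext.
val files = sc.wholeTextFiles("hdfs://a-hdfs-path")
val lineCounts = files.map { case (path, content) => (path, content.split("\n").length) }
lineCounts.collect().foreach { case (path, n) => println(s"$path: $n lines") }
}}}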

/**
* Get an RDD for a Hadoop-readable dataset from a Hadoop JobConf given its InputFormat and other
* necessary info (e.g. file name for a filesystem-based dataset, table name for HyperTable),
core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
@@ -154,6 +154,34 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
*/
def textFile(path: String, minSplits: Int): JavaRDD[String] = sc.textFile(path, minSplits)

/**
* Read a directory of text files from HDFS, a local file system (available on all nodes), or any
* Hadoop-supported file system URI. Each file is read as a single record and returned in a
* key-value pair, where the key is the path of each file and the value is the content of each file.
*
* <p> For example, if you have the following files:
* {{{
* hdfs://a-hdfs-path/part-00000
* hdfs://a-hdfs-path/part-00001
* ...
* hdfs://a-hdfs-path/part-nnnnn
* }}}
*
* Do `JavaPairRDD<String, String> rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")`,
*
* <p> then `rdd` contains
* {{{
* (a-hdfs-path/part-00000, its content)
* (a-hdfs-path/part-00001, its content)
* ...
* (a-hdfs-path/part-nnnnn, its content)
* }}}
*
* @note Small files are preferred, as each file will be loaded fully in memory.
*/
def wholeTextFiles(path: String): JavaPairRDD[String, String] =
new JavaPairRDD(sc.wholeTextFiles(path))

/** Get an RDD for a Hadoop SequenceFile with given key and value types.
*
* '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -19,6 +19,7 @@ package org.apache.spark.api.python

import java.io._
import java.net._
import java.nio.charset.Charset
import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}

import scala.collection.JavaConversions._
@@ -206,6 +207,7 @@
}

private[spark] object PythonRDD {
val UTF8 = Charset.forName("UTF-8")

def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int):
JavaRDD[Array[Byte]] = {
@@ -266,7 +268,7 @@
}

def writeUTF(str: String, dataOut: DataOutputStream) {
val bytes = str.getBytes("UTF-8")
val bytes = str.getBytes(UTF8)
dataOut.writeInt(bytes.length)
dataOut.write(bytes)
}
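As a side note, the change resolves the Charset once and reuses it rather than re-resolving the "UTF-8" name on every getBytes call. A self-contained sketch of the same length-prefixed write is below, with names chosen purely for illustration.

{{{
import java.io.{ByteArrayOutputStream, DataOutputStream}
import java.nio.charset.Charset

object Utf8WriteSketch {
  // Resolve the charset once, mirroring PythonRDD.UTF8 above
  val UTF8: Charset = Charset.forName("UTF-8")

  // Length-prefixed UTF-8 write, mirroring PythonRDD.writeUTF
  def writeUTF(str: String, dataOut: DataOutputStream): Unit = {
    val bytes = str.getBytes(UTF8)
    dataOut.writeInt(bytes.length)
    dataOut.write(bytes)
  }

  def main(args: Array[String]): Unit = {
    val buf = new ByteArrayOutputStream()
    writeUTF("héllo", new DataOutputStream(buf))
    println(buf.size()) // 4-byte length prefix + 6 UTF-8 payload bytes = 10
  }
}
}}}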
@@ -286,7 +288,7 @@

private
class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] {
override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8")
override def call(arr: Array[Byte]) : String = new String(arr, PythonRDD.UTF8)
}

/**
13 changes: 8 additions & 5 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -79,20 +79,23 @@ object SparkSubmit {
printErrorAndExit("master must start with yarn, mesos, spark, or local")
}

// Because "yarn-standalone" and "yarn-client" encapsulate both the master
// Because "yarn-cluster" and "yarn-client" encapsulate both the master
// and deploy mode, we have some logic to infer the master and deploy mode
// from each other if only one is specified, or exit early if they are at odds.
if (appArgs.deployMode == null && appArgs.master == "yarn-standalone") {
if (appArgs.deployMode == null &&
(appArgs.master == "yarn-standalone" || appArgs.master == "yarn-cluster")) {
appArgs.deployMode = "cluster"
}
if (appArgs.deployMode == "cluster" && appArgs.master == "yarn-client") {
printErrorAndExit("Deploy mode \"cluster\" and master \"yarn-client\" are not compatible")
}
if (appArgs.deployMode == "client" && appArgs.master == "yarn-standalone") {
printErrorAndExit("Deploy mode \"client\" and master \"yarn-standalone\" are not compatible")
if (appArgs.deployMode == "client" &&
(appArgs.master == "yarn-standalone" || appArgs.master == "yarn-cluster")) {
printErrorAndExit("Deploy mode \"client\" and master \"" + appArgs.master
+ "\" are not compatible")
}
if (appArgs.deployMode == "cluster" && appArgs.master.startsWith("yarn")) {
appArgs.master = "yarn-standalone"
appArgs.master = "yarn-cluster"
}
if (appArgs.deployMode != "cluster" && appArgs.master.startsWith("yarn")) {
appArgs.master = "yarn-client"
core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -171,7 +171,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
outStream.println("Unknown/unsupported param " + unknownParam)
}
outStream.println(
"""Usage: spark-submit <primary binary> [options]
"""Usage: spark-submit <app jar> [options]
|Options:
| --master MASTER_URL spark://host:port, mesos://host:port, yarn, or local.
| --deploy-mode DEPLOY_MODE Mode to deploy the app in, either 'client' or 'cluster'.
core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -53,7 +53,8 @@ private[spark] class CoarseGrainedExecutorBackend(
case RegisteredExecutor(sparkProperties) =>
logInfo("Successfully registered with driver")
// Make this host instead of hostPort ?
executor = new Executor(executorId, Utils.parseHostPort(hostPort)._1, sparkProperties)
executor = new Executor(executorId, Utils.parseHostPort(hostPort)._1, sparkProperties,
false)

case RegisterExecutorFailed(message) =>
logError("Slave registration failed: " + message)
@@ -105,7 +106,8 @@
// set it
val sparkHostPort = hostname + ":" + boundPort
actorSystem.actorOf(
Props(classOf[CoarseGrainedExecutorBackend], driverUrl, executorId, sparkHostPort, cores),
Props(classOf[CoarseGrainedExecutorBackend], driverUrl, executorId,
sparkHostPort, cores),
name = "Executor")
workerUrl.foreach{ url =>
actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher")
core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala
@@ -41,13 +41,22 @@ object ExecutorExitCode {
/** DiskStore failed to create a local temporary directory after many attempts. */
val DISK_STORE_FAILED_TO_CREATE_DIR = 53

/** TachyonStore failed to initialize after many attempts. */
val TACHYON_STORE_FAILED_TO_INITIALIZE = 54

/** TachyonStore failed to create a local temporary directory after many attempts. */
val TACHYON_STORE_FAILED_TO_CREATE_DIR = 55

def explainExitCode(exitCode: Int): String = {
exitCode match {
case UNCAUGHT_EXCEPTION => "Uncaught exception"
case UNCAUGHT_EXCEPTION_TWICE => "Uncaught exception, and logging the exception failed"
case OOM => "OutOfMemoryError"
case DISK_STORE_FAILED_TO_CREATE_DIR =>
"Failed to create local directory (bad spark.local.dir?)"
case TACHYON_STORE_FAILED_TO_INITIALIZE => "TachyonStore failed to initialize."
case TACHYON_STORE_FAILED_TO_CREATE_DIR =>
"TachyonStore failed to create a local temporary directory."
case _ =>
"Unknown executor exit code (" + exitCode + ")" + (
if (exitCode > 128) {
(Diff truncated: the remaining changed files were not loaded.)
