Commit 2825a13

Up-merging to the current master branch of Apache Spark
RongGu committed Mar 24, 2014
2 parents 8968b67 + 646e554 commit 2825a13
Showing 11,572 changed files with 253,338 additions and 2,800 deletions.
The full diff is too large to display; only the first 3,000 changed files are loaded here.
2 changes: 1 addition & 1 deletion NOTICE
@@ -1,5 +1,5 @@
Apache Spark
Copyright 2013 The Apache Software Foundation.
Copyright 2014 The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
5 changes: 5 additions & 0 deletions assembly/pom.xml
@@ -79,6 +79,11 @@
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
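
The spark-sql artifact added to the assembly above is what lets applications built against this branch use the new SQL layer without extra packaging steps. What follows is a rough, hedged sketch of a job that would exercise those classes once they are in the assembly jar; it assumes the 1.0-era SchemaRDD API (SQLContext, the implicit createSchemaRDD conversion, and registerAsTable), and the KV case class and object name are made up for illustration.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Hypothetical record type used only for this sketch.
case class KV(key: Int, value: String)

object SqlAssemblySmokeTest {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("sql-assembly-smoke-test"))
    val sqlContext = new SQLContext(sc)
    // Brings the implicit RDD-to-SchemaRDD conversion and the sql() method into scope.
    import sqlContext._

    val kvs = sc.parallelize(1 to 10).map(i => KV(i, "val_" + i))
    kvs.registerAsTable("kvs")

    // Runs against the spark-sql classes now shipped in the assembly jar.
    sql("SELECT key, value FROM kvs WHERE key > 5").collect().foreach(println)
    sc.stop()
  }
}
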
31 changes: 27 additions & 4 deletions bin/compute-classpath.sh
@@ -33,23 +33,43 @@ fi
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
# the future.
if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
echo "Hive assembly found, including hive support. If this isn't desired run sbt hive/clean."

# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS

ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
else
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
fi

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar ]; then
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark-assembly*.jar`
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar`
else
ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar`
ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar`
fi
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi
@@ -62,6 +82,9 @@ if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/test-classes"
fi

# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -17,8 +17,6 @@

package org.apache.spark

import scala.{Option, deprecated}

import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}

/**
76 changes: 45 additions & 31 deletions core/src/main/scala/org/apache/spark/CacheManager.scala
@@ -20,11 +20,12 @@ package org.apache.spark
import scala.collection.mutable.{ArrayBuffer, HashSet}

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockManager, RDDBlockId, StorageLevel}
import org.apache.spark.storage.{BlockId, BlockManager, BlockStatus, RDDBlockId, StorageLevel}

/** Spark class responsible for passing RDDs split contents to the BlockManager and making
sure a node doesn't load two copies of an RDD at once.
*/
/**
* Spark class responsible for passing RDDs split contents to the BlockManager and making
* sure a node doesn't load two copies of an RDD at once.
*/
private[spark] class CacheManager(blockManager: BlockManager) extends Logging {

/** Keys of RDD splits that are being computed/loaded. */
Expand All @@ -49,11 +50,11 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
try {loading.wait()} catch {case _ : Throwable =>}
}
logInfo("Finished waiting for %s".format(key))
// See whether someone else has successfully loaded it. The main way this would fail
// is for the RDD-level cache eviction policy if someone else has loaded the same RDD
// partition but we didn't want to make space for it. However, that case is unlikely
// because it's unlikely that two threads would work on the same RDD partition. One
// downside of the current code is that threads wait serially if this does happen.
/* See whether someone else has successfully loaded it. The main way this would fail
* is for the RDD-level cache eviction policy if someone else has loaded the same RDD
* partition but we didn't want to make space for it. However, that case is unlikely
* because it's unlikely that two threads would work on the same RDD partition. One
* downside of the current code is that threads wait serially if this does happen. */
blockManager.get(key) match {
case Some(values) =>
return new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
Expand All @@ -69,32 +70,45 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
// If we got here, we have to load the split
logInfo("Partition %s not found, computing it".format(key))
val computedValues = rdd.computeOrReadCheckpoint(split, context)

// Persist the result, so long as the task is not running locally
if (context.runningLocally) { return computedValues }
if (storageLevel.useDisk && !storageLevel.useMemory) {
// In the case that this RDD is to be persisted using DISK_ONLY
// the iterator will be passed directly to the blockManager (rather then
// caching it to an ArrayBuffer first), then the resulting block data iterator
// will be passed back to the user. If the iterator generates a lot of data,
// this means that it doesn't all have to be held in memory at one time.
// This could also apply to MEMORY_ONLY_SER storage, but we need to make sure
// blocks aren't dropped by the block store before enabling that.
blockManager.put(key, computedValues, storageLevel, tellMaster = true)
return blockManager.get(key) match {
case Some(values) =>
return new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
case None =>
logInfo("Failure to store %s".format(key))
throw new Exception("Block manager failed to return persisted valued")

// Keep track of blocks with updated statuses
var updatedBlocks = Seq[(BlockId, BlockStatus)]()
val returnValue: Iterator[T] = {
if (storageLevel.useDisk && !storageLevel.useMemory) {
/* In the case that this RDD is to be persisted using DISK_ONLY
* the iterator will be passed directly to the blockManager (rather then
* caching it to an ArrayBuffer first), then the resulting block data iterator
* will be passed back to the user. If the iterator generates a lot of data,
* this means that it doesn't all have to be held in memory at one time.
* This could also apply to MEMORY_ONLY_SER storage, but we need to make sure
* blocks aren't dropped by the block store before enabling that. */
updatedBlocks = blockManager.put(key, computedValues, storageLevel, tellMaster = true)
blockManager.get(key) match {
case Some(values) =>
new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
case None =>
logInfo("Failure to store %s".format(key))
throw new Exception("Block manager failed to return persisted valued")
}
} else {
// In this case the RDD is cached to an array buffer. This will save the results
// if we're dealing with a 'one-time' iterator
val elements = new ArrayBuffer[Any]
elements ++= computedValues
updatedBlocks = blockManager.put(key, elements, storageLevel, tellMaster = true)
elements.iterator.asInstanceOf[Iterator[T]]
}
} else {
// In this case the RDD is cached to an array buffer. This will save the results
// if we're dealing with a 'one-time' iterator
val elements = new ArrayBuffer[Any]
elements ++= computedValues
blockManager.put(key, elements, storageLevel, tellMaster = true)
return elements.iterator.asInstanceOf[Iterator[T]]
}

// Update task metrics to include any blocks whose storage status is updated
val metrics = context.taskMetrics
metrics.updatedBlocks = Some(updatedBlocks)

returnValue

} finally {
loading.synchronized {
loading.remove(key)
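
The rewritten block above separates two code paths: with DISK_ONLY the computed iterator is handed straight to the block manager so the partition never has to be fully materialized in memory, while every other storage level is buffered into an ArrayBuffer first so a one-time iterator can be both stored and returned, and either path now records the updated block statuses for task metrics. A standalone sketch of that branching decision follows; the KVStore trait and its method names are illustrative stand-ins, not the real BlockManager API.

import scala.collection.mutable.ArrayBuffer

// Illustrative stand-in for the block manager; the names are hypothetical.
trait KVStore {
  def putIterator(key: String, values: Iterator[Any]): Unit   // may stream to disk
  def putBuffer(key: String, values: ArrayBuffer[Any]): Unit  // kept in memory
  def get(key: String): Option[Iterator[Any]]
}

def cacheAndReturn[T](store: KVStore, key: String, computed: Iterator[T],
                      diskOnly: Boolean): Iterator[T] = {
  if (diskOnly) {
    // DISK_ONLY: stream the iterator straight into the store and read it back,
    // so the whole partition never sits in memory at once.
    store.putIterator(key, computed)
    store.get(key)
      .getOrElse(throw new Exception("store failed to return persisted value"))
      .asInstanceOf[Iterator[T]]
  } else {
    // Other levels: materialize into a buffer so a one-time iterator can be
    // both cached and handed back to the caller.
    val elements = new ArrayBuffer[Any]
    elements ++= computed
    store.putBuffer(key, elements)
    elements.iterator.asInstanceOf[Iterator[T]]
  }
}
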
6 changes: 4 additions & 2 deletions core/src/main/scala/org/apache/spark/Dependency.scala
@@ -18,6 +18,7 @@
package org.apache.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer

/**
* Base class for dependencies.
Expand All @@ -43,12 +44,13 @@ abstract class NarrowDependency[T](rdd: RDD[T]) extends Dependency(rdd) {
* Represents a dependency on the output of a shuffle stage.
* @param rdd the parent RDD
* @param partitioner partitioner used to partition the shuffle output
* @param serializerClass class name of the serializer to use
* @param serializer [[Serializer]] to use. If set to null, the default serializer, as specified
* by `spark.serializer` config option, will be used.
*/
class ShuffleDependency[K, V](
@transient rdd: RDD[_ <: Product2[K, V]],
val partitioner: Partitioner,
val serializerClass: String = null)
val serializer: Serializer = null)
extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {

val shuffleId: Int = rdd.context.newShuffleId()
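
With this signature change a ShuffleDependency carries an actual Serializer instance rather than a serializer class name, and leaving it as null falls back to whatever spark.serializer configures. Below is a hedged sketch of constructing one under the new signature; KryoSerializer is shown only as an assumed example of a concrete Serializer, and the key/value types and partition count are arbitrary.

import org.apache.spark.{HashPartitioner, ShuffleDependency, SparkConf}
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.KryoSerializer

// Builds a shuffle dependency whose map outputs are serialized with Kryo
// instead of the default named by spark.serializer.
def kryoShuffleDep(pairs: RDD[(Int, String)], conf: SparkConf): ShuffleDependency[Int, String] = {
  new ShuffleDependency[Int, String](pairs, new HashPartitioner(8), new KryoSerializer(conf))
}
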
19 changes: 17 additions & 2 deletions core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -35,13 +35,28 @@ private[spark] case class GetMapOutputStatuses(shuffleId: Int)
extends MapOutputTrackerMessage
private[spark] case object StopMapOutputTracker extends MapOutputTrackerMessage

private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster)
private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster, conf: SparkConf)
extends Actor with Logging {
val maxAkkaFrameSize = AkkaUtils.maxFrameSizeBytes(conf)

def receive = {
case GetMapOutputStatuses(shuffleId: Int) =>
val hostPort = sender.path.address.hostPort
logInfo("Asked to send map output locations for shuffle " + shuffleId + " to " + hostPort)
sender ! tracker.getSerializedMapOutputStatuses(shuffleId)
val mapOutputStatuses = tracker.getSerializedMapOutputStatuses(shuffleId)
val serializedSize = mapOutputStatuses.size
if (serializedSize > maxAkkaFrameSize) {
val msg = s"Map output statuses were $serializedSize bytes which " +
s"exceeds spark.akka.frameSize ($maxAkkaFrameSize bytes)."

/* For SPARK-1244 we'll opt for just logging an error and then throwing an exception.
* Note that on exception the actor will just restart. A bigger refactoring (SPARK-1239)
* will ultimately remove this entire code path. */
val exception = new SparkException(msg)
logError(msg, exception)
throw exception
}
sender ! mapOutputStatuses

case StopMapOutputTracker =>
logInfo("MapOutputTrackerActor stopped!")
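
The new guard above refuses to send a reply whose serialized map output statuses would exceed the Akka frame size, logging the error before throwing so the failure stays visible even though the actor simply restarts. The same check-then-fail pattern in isolation, as a standalone sketch; the logging call and exception type are simplified stand-ins for Spark's Logging trait and SparkException.

// Standalone sketch of the frame-size guard; not the actual tracker actor code.
def replyIfItFits(serializedStatuses: Array[Byte], maxFrameSizeBytes: Int,
                  send: Array[Byte] => Unit): Unit = {
  val serializedSize = serializedStatuses.length
  if (serializedSize > maxFrameSizeBytes) {
    val msg = "Map output statuses were " + serializedSize + " bytes, which exceeds " +
      "spark.akka.frameSize (" + maxFrameSizeBytes + " bytes)."
    System.err.println(msg)          // stands in for logError(msg, exception)
    throw new RuntimeException(msg)  // stands in for SparkException
  }
  send(serializedStatuses)
}
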
8 changes: 4 additions & 4 deletions core/src/main/scala/org/apache/spark/SecurityManager.scala
@@ -18,13 +18,13 @@
package org.apache.spark

import java.net.{Authenticator, PasswordAuthentication}
import org.apache.hadoop.io.Text
import org.apache.hadoop.security.Credentials
import org.apache.hadoop.security.UserGroupInformation
import org.apache.spark.deploy.SparkHadoopUtil

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.io.Text

import org.apache.spark.deploy.SparkHadoopUtil

/**
* Spark class responsible for security.
*
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/ShuffleFetcher.scala
@@ -29,7 +29,7 @@ private[spark] abstract class ShuffleFetcher {
shuffleId: Int,
reduceId: Int,
context: TaskContext,
serializer: Serializer = SparkEnv.get.serializerManager.default): Iterator[T]
serializer: Serializer = SparkEnv.get.serializer): Iterator[T]

/** Stop the fetcher */
def stop() {}