Merge branch 'master' into api

witgo · Apr 9, 2014 · 555e0fe · 555e0fe
2 parents ef1a717 + 9689b66
commit 555e0fe
Show file tree

Hide file tree

Showing 191 changed files with 6,494 additions and 2,719 deletions.
diff --git a/.rat-excludes b/.rat-excludes
@@ -39,4 +39,4 @@ work
 .*\.q
 golden
 test.out/*
-.*iml
+.*iml
diff --git a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala
@@ -220,27 +220,31 @@ object Bagel extends Logging {
    */
   private def comp[K: Manifest, V <: Vertex, M <: Message[K], C](
     sc: SparkContext,
-    grouped: RDD[(K, (Seq[C], Seq[V]))],
+    grouped: RDD[(K, (Iterable[C], Iterable[V]))],
     compute: (V, Option[C]) => (V, Array[M]),
     storageLevel: StorageLevel
   ): (RDD[(K, (V, Array[M]))], Int, Int) = {
     var numMsgs = sc.accumulator(0)
     var numActiveVerts = sc.accumulator(0)
-    val processed = grouped.flatMapValues {
-      case (_, vs) if vs.size == 0 => None
-      case (c, vs) =>
+    val processed = grouped.mapValues(x => (x._1.iterator, x._2.iterator))
+      .flatMapValues {
+      case (_, vs) if !vs.hasNext => None
+      case (c, vs) => {
         val (newVert, newMsgs) =
-          compute(vs(0), c match {
-            case Seq(comb) => Some(comb)
-            case Seq() => None
-          })
+          compute(vs.next,
+            c.hasNext match {
+              case true => Some(c.next)
+              case false => None
+            }
+          )
 
         numMsgs += newMsgs.size
         if (newVert.active) {
           numActiveVerts += 1
         }
 
         Some((newVert, newMsgs))
+      }
     }.persist(storageLevel)
 
     // Force evaluation of processed RDD for accurate performance measurements

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
@@ -63,7 +63,7 @@ fi
 # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
 # assembly is built for Hive, before actually populating the CLASSPATH with the jars.
 # Note that this check order is faster (by up to half a second) in the case where Hive is not used.
-num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
 if [ $num_datanucleus_jars -gt 0 ]; then
   AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
   num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)

diff --git a/bin/spark-shell b/bin/spark-shell
@@ -34,7 +34,7 @@ set -o posix
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
 SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
-DEFAULT_MASTER="local"
+DEFAULT_MASTER="local[*]"
 MASTER=${MASTER:-""}
 
 info_log=0
@@ -64,7 +64,7 @@ ${txtbld}OPTIONS${txtrst}:
                               is followed by m for megabytes or g for gigabytes, e.g. "1g".
     -dm --driver-memory     : The memory used by the Spark Shell, the number is followed 
                               by m for megabytes or g for gigabytes, e.g. "1g".
-    -m  --master            : A full string that describes the Spark Master, defaults to "local"
+    -m  --master            : A full string that describes the Spark Master, defaults to "local[*]"
                               e.g. "spark://localhost:7077".
     --log-conf              : Enables logging of the supplied SparkConf as INFO at start of the
                               Spark Context.

diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import java.lang.ref.{ReferenceQueue, WeakReference}
+
+import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
+
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+
+/**
+ * Classes that represent cleaning tasks.
+ */
+private sealed trait CleanupTask
+private case class CleanRDD(rddId: Int) extends CleanupTask
+private case class CleanShuffle(shuffleId: Int) extends CleanupTask
+private case class CleanBroadcast(broadcastId: Long) extends CleanupTask
+
+/**
+ * A WeakReference associated with a CleanupTask.
+ *
+ * When the referent object becomes only weakly reachable, the corresponding
+ * CleanupTaskWeakReference is automatically added to the given reference queue.
+ */
+private class CleanupTaskWeakReference(
+    val task: CleanupTask,
+    referent: AnyRef,
+    referenceQueue: ReferenceQueue[AnyRef])
+  extends WeakReference(referent, referenceQueue)
+
+/**
+ * An asynchronous cleaner for RDD, shuffle, and broadcast state.
+ *
+ * This maintains a weak reference for each RDD, ShuffleDependency, and Broadcast of interest,
+ * to be processed when the associated object goes out of scope of the application. Actual
+ * cleanup is performed in a separate daemon thread.
+ */
+private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
+
+  private val referenceBuffer = new ArrayBuffer[CleanupTaskWeakReference]
+    with SynchronizedBuffer[CleanupTaskWeakReference]
+
+  private val referenceQueue = new ReferenceQueue[AnyRef]
+
+  private val listeners = new ArrayBuffer[CleanerListener]
+    with SynchronizedBuffer[CleanerListener]
+
+  private val cleaningThread = new Thread() { override def run() { keepCleaning() }}
+
+  /**
+   * Whether the cleaning thread will block on cleanup tasks.
+   * This is set to true only for tests.
+   */
+  private val blockOnCleanupTasks = sc.conf.getBoolean(
+    "spark.cleaner.referenceTracking.blocking", false)
+
+  @volatile private var stopped = false
+
+  /** Attach a listener object to be notified when objects are cleaned. */
+  def attachListener(listener: CleanerListener) {
+    listeners += listener
+  }
+
+  /** Start the cleaner. */
+  def start() {
+    cleaningThread.setDaemon(true)
+    cleaningThread.setName("Spark Context Cleaner")
+    cleaningThread.start()
+  }
+
+  /** Stop the cleaner. */
+  def stop() {
+    stopped = true
+  }
+
+  /** Register an RDD for cleanup when it is garbage collected. */
+  def registerRDDForCleanup(rdd: RDD[_]) {
+    registerForCleanup(rdd, CleanRDD(rdd.id))
+  }
+
+  /** Register a ShuffleDependency for cleanup when it is garbage collected. */
+  def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _]) {
+    registerForCleanup(shuffleDependency, CleanShuffle(shuffleDependency.shuffleId))
+  }
+
+  /** Register a Broadcast for cleanup when it is garbage collected. */
+  def registerBroadcastForCleanup[T](broadcast: Broadcast[T]) {
+    registerForCleanup(broadcast, CleanBroadcast(broadcast.id))
+  }
+
+  /** Register an object for cleanup. */
+  private def registerForCleanup(objectForCleanup: AnyRef, task: CleanupTask) {
+    referenceBuffer += new CleanupTaskWeakReference(task, objectForCleanup, referenceQueue)
+  }
+
+  /** Keep cleaning RDD, shuffle, and broadcast state. */
+  private def keepCleaning() {
+    while (!stopped) {
+      try {
+        val reference = Option(referenceQueue.remove(ContextCleaner.REF_QUEUE_POLL_TIMEOUT))
+          .map(_.asInstanceOf[CleanupTaskWeakReference])
+        reference.map(_.task).foreach { task =>
+          logDebug("Got cleaning task " + task)
+          referenceBuffer -= reference.get
+          task match {
+            case CleanRDD(rddId) =>
+              doCleanupRDD(rddId, blocking = blockOnCleanupTasks)
+            case CleanShuffle(shuffleId) =>
+              doCleanupShuffle(shuffleId, blocking = blockOnCleanupTasks)
+            case CleanBroadcast(broadcastId) =>
+              doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks)
+          }
+        }
+      } catch {
+        case t: Throwable => logError("Error in cleaning thread", t)
+      }
+    }
+  }
+
+  /** Perform RDD cleanup. */
+  def doCleanupRDD(rddId: Int, blocking: Boolean) {
+    try {
+      logDebug("Cleaning RDD " + rddId)
+      sc.unpersistRDD(rddId, blocking)
+      listeners.foreach(_.rddCleaned(rddId))
+      logInfo("Cleaned RDD " + rddId)
+    } catch {
+      case t: Throwable => logError("Error cleaning RDD " + rddId, t)
+    }
+  }
+
+  /** Perform shuffle cleanup. */
+  def doCleanupShuffle(shuffleId: Int, blocking: Boolean) {
+    try {
+      logDebug("Cleaning shuffle " + shuffleId)
+      mapOutputTrackerMaster.unregisterShuffle(shuffleId)
+      blockManagerMaster.removeShuffle(shuffleId, blocking)
+      listeners.foreach(_.shuffleCleaned(shuffleId))
+      logInfo("Cleaned shuffle " + shuffleId)
+    } catch {
+      case t: Throwable => logError("Error cleaning shuffle " + shuffleId, t)
+    }
+  }
+
+  /** Perform broadcast cleanup. */
+  def doCleanupBroadcast(broadcastId: Long, blocking: Boolean) {
+    try {
+      logDebug("Cleaning broadcast " + broadcastId)
+      broadcastManager.unbroadcast(broadcastId, true, blocking)
+      listeners.foreach(_.broadcastCleaned(broadcastId))
+      logInfo("Cleaned broadcast " + broadcastId)
+    } catch {
+      case t: Throwable => logError("Error cleaning broadcast " + broadcastId, t)
+    }
+  }
+
+  private def blockManagerMaster = sc.env.blockManager.master
+  private def broadcastManager = sc.env.broadcastManager
+  private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
+
+  // Used for testing. These methods explicitly block until cleanup is completed
+  // to ensure more reliable testing.
+}
+
+private object ContextCleaner {
+  private val REF_QUEUE_POLL_TIMEOUT = 100
+}
+
+/**
+ * Listener trait used in tests to observe when an item has been cleaned by the ContextCleaner.
+ */
+private[spark] trait CleanerListener {
+  def rddCleaned(rddId: Int)
+  def shuffleCleaned(shuffleId: Int)
+  def broadcastCleaned(broadcastId: Long)
+}
diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala
@@ -55,6 +55,8 @@ class ShuffleDependency[K, V](
   extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {
 
   val shuffleId: Int = rdd.context.newShuffleId()
+
+  rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
 }
 
 

diff --git a/core/src/main/scala/org/apache/spark/FutureAction.scala b/core/src/main/scala/org/apache/spark/FutureAction.scala
@@ -141,7 +141,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc:
   private def awaitResult(): Try[T] = {
     jobWaiter.awaitResult() match {
       case JobSucceeded => scala.util.Success(resultFunc)
-      case JobFailed(e: Exception, _) => scala.util.Failure(e)
+      case JobFailed(e: Exception) => scala.util.Failure(e)
     }
   }
 }