Reuse more code in YarnClientSchedulerBackend
We currently implement a while loop to monitor an application's state in
four separate places (stable/Client, alpha/Client, and twice in
YarnClientSchedulerBackend). This commit consolidates them into one.
andrewor14 committed Sep 9, 2014
1 parent 3f941dc commit fabe4c4
Showing 3 changed files with 72 additions and 61 deletions.
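For orientation, here is a minimal standalone sketch of the polling loop this commit consolidates into ClientBase.monitorApplication. The names and flags follow the diff below, but the report fetching and logging are simplified, so treat it as an illustration rather than the exact implementation:

import org.apache.hadoop.yarn.api.records.{ApplicationId, ApplicationReport, YarnApplicationState}

object MonitorSketch {
  // Simplified stand-in for the consolidated monitorApplication: poll the
  // ResourceManager until the application exits, or (optionally) until it is RUNNING.
  // In the real code the report comes from getApplicationReport(appId) and the
  // interval from the "spark.yarn.report.interval" setting (milliseconds).
  def monitorApplication(
      appId: ApplicationId,
      getReport: ApplicationId => ApplicationReport,
      returnOnRunning: Boolean = false,
      interval: Long = 1000): YarnApplicationState = {
    while (true) {
      Thread.sleep(interval)
      val state = getReport(appId).getYarnApplicationState
      val finished =
        state == YarnApplicationState.FINISHED ||
        state == YarnApplicationState.FAILED ||
        state == YarnApplicationState.KILLED
      if (finished || (returnOnRunning && state == YarnApplicationState.RUNNING)) {
        return state
      }
    }
    throw new IllegalStateException("unreachable, but keeps the compiler happy")
  }
}

The same loop then serves both call sites in YarnClientSchedulerBackend: one blocks until the application is RUNNING, the other blocks until it exits.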
@@ -435,36 +435,51 @@ private[spark] trait ClientBase extends Logging {

/**
* Report the state of an application until it has exited, either successfully or
* due to some failure.
* due to some failure, then return the application state.
*
* @param returnOnRunning Whether to also return the application state when it is RUNNING.
* @param logApplicationReport Whether to log details of the application report every iteration.
* @return state of the application, one of FINISHED, FAILED, KILLED, and RUNNING.
*/
def monitorApplication(appId: ApplicationId): Unit = {
def monitorApplication(
appId: ApplicationId,
returnOnRunning: Boolean = false,
logApplicationReport: Boolean = true): YarnApplicationState = {
val interval = sparkConf.getLong("spark.yarn.report.interval", 1000)
while (true) {
Thread.sleep(interval)
val report = getApplicationReport(appId)
val state = report.getYarnApplicationState

logInfo(s"Application report from ResourceManager for ${appId.getId} (state: $state)")

logDebug(
s"\t full application identifier: $appId\n" +
s"\t clientToken: ${getClientToken(report)}\n" +
s"\t appDiagnostics: ${report.getDiagnostics}\n" +
s"\t appMasterHost: ${report.getHost}\n" +
s"\t appQueue: ${report.getQueue}\n" +
s"\t appMasterRpcPort: ${report.getRpcPort}\n" +
s"\t appStartTime: ${report.getStartTime}\n" +
s"\t yarnAppState: $state\n" +
s"\t distributedFinalState: ${report.getFinalApplicationStatus}\n" +
s"\t appTrackingUrl: ${report.getTrackingUrl}\n" +
s"\t appUser: ${report.getUser}")
if (logApplicationReport) {
logInfo(s"Application report from ResourceManager for application ${appId.getId} " +
s"(state: $state)")
logDebug(
s"\t full application identifier: $appId\n" +
s"\t clientToken: ${getClientToken(report)}\n" +
s"\t appDiagnostics: ${report.getDiagnostics}\n" +
s"\t appMasterHost: ${report.getHost}\n" +
s"\t appQueue: ${report.getQueue}\n" +
s"\t appMasterRpcPort: ${report.getRpcPort}\n" +
s"\t appStartTime: ${report.getStartTime}\n" +
s"\t yarnAppState: $state\n" +
s"\t distributedFinalState: ${report.getFinalApplicationStatus}\n" +
s"\t appTrackingUrl: ${report.getTrackingUrl}\n" +
s"\t appUser: ${report.getUser}")
}

if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
return
return state
}

if (returnOnRunning && state == YarnApplicationState.RUNNING) {
return state
}
}
// Never reached, but keeps compiler happy
throw new SparkException("While loop is depleted! This should never happen...")
}

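To illustrate how the two new flags are intended to be used, here is a hedged caller-side sketch (not code from this commit); it assumes a Client built on this trait and a submitted appId, as in YarnClientSchedulerBackend below:

// Hypothetical caller-side helpers, placed in the spark package because Client
// and ClientBase are package-private (private[spark]).
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.spark.SparkException
import org.apache.spark.deploy.yarn.Client

object MonitorUsageSketch {
  // Block only until the application is RUNNING (or has already ended).
  def waitUntilRunning(client: Client, appId: ApplicationId): Unit = {
    val state = client.monitorApplication(appId, returnOnRunning = true)
    if (state != YarnApplicationState.RUNNING) {
      throw new SparkException(s"Yarn application ended before running: $state")
    }
  }

  // Block until the application exits, without logging a report each iteration.
  def blockUntilExit(client: Client, appId: ApplicationId): YarnApplicationState =
    client.monitorApplication(appId, logApplicationReport = false)
}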
@@ -19,7 +19,7 @@ package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.spark.{SparkException, Logging, SparkContext}
import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
import org.apache.spark.deploy.yarn.{Client, ClientArguments}
import org.apache.spark.scheduler.TaskSchedulerImpl

import scala.collection.mutable.ArrayBuffer
@@ -36,7 +36,6 @@ private[spark] class YarnClientSchedulerBackend(

var client: Client = null
var appId: ApplicationId = null
var checkerThread: Thread = null
var stopping: Boolean = false
var totalExpectedExecutors = 0

@@ -83,66 +82,62 @@ private[spark] class YarnClientSchedulerBackend(
totalExpectedExecutors = args.numExecutors
client = new Client(args, conf)
appId = client.submitApplication()
waitForApp()
checkerThread = yarnApplicationStateCheckerThread()
waitForApplication()
asyncMonitorApplication()
}

def waitForApp() {

// TODO : need a better way to find out whether the executors are ready or not
// maybe by resource usage report?
while(true) {
val report = client.getApplicationReport(appId)

logInfo("Application report from ASM: \n" +
"\t appMasterRpcPort: " + report.getRpcPort() + "\n" +
"\t appStartTime: " + report.getStartTime() + "\n" +
"\t yarnAppState: " + report.getYarnApplicationState() + "\n"
)

// Ready to go, or already gone.
val state = report.getYarnApplicationState()
if (state == YarnApplicationState.RUNNING) {
return
} else if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
throw new SparkException("Yarn application already ended," +
"might be killed or not able to launch application master.")
}

Thread.sleep(1000)
/**
* Report the state of the application until it is running.
* If the application has finished, failed or been killed in the process, throw an exception.
* This assumes both `client` and `appId` have already been set.
*/
private def waitForApplication(): Unit = {
assert(client != null && appId != null, "Application has not been submitted yet!")
val state = client.monitorApplication(appId, returnOnRunning = true) // blocking
if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
throw new SparkException("Yarn application has already ended! " +
"It might have been killed or unable to launch application master.")
}
if (state == YarnApplicationState.RUNNING) {
logInfo(s"Application ${appId.getId} has started running.")
}
}

private def yarnApplicationStateCheckerThread(): Thread = {
/**
* Monitor the application state in a separate thread.
* If the application has exited for any reason, stop the SparkContext.
* This assumes both `client` and `appId` have already been set.
*/
private def asyncMonitorApplication(): Thread = {
assert(client != null && appId != null, "Application has not been submitted yet!")
val t = new Thread {
override def run() {
while (!stopping) {
val report = client.getApplicationReport(appId)
val state = report.getYarnApplicationState()
if (state == YarnApplicationState.FINISHED || state == YarnApplicationState.KILLED
|| state == YarnApplicationState.FAILED) {
logError(s"Yarn application already ended: $state")
sc.stop()
stopping = true
}
Thread.sleep(1000L)
val state = client.monitorApplication(appId, logApplicationReport = false) // blocking
if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.KILLED ||
state == YarnApplicationState.FAILED) {
logWarning(s"Yarn application has exited: $state")
sc.stop()
stopping = true
}
checkerThread = null
Thread.currentThread().interrupt()
}
}
t.setName("Yarn Application State Checker")
t.setName("Yarn Application State Monitor")
t.setDaemon(true)
t.start()
t
}

/**
* Stop the scheduler. This assumes `start()` has already been called.
*/
override def stop() {
assert(client != null, "Attempted to stop this scheduler before starting it!")
stopping = true
super.stop()
client.stop
client.stop()
logInfo("Stopped")
}

@@ -109,6 +109,7 @@ private[spark] class Client(
yarnClient.getApplicationReport(appId)

/** */
// FIXME: This could throw NPE
override def getClientToken(report: ApplicationReport): String =
report.getClientToAMToken.toString
}
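One possible way to address the FIXME above (an illustrative sketch, not part of this commit) is to guard against a missing token with Option:

// Hypothetical null-safe variant of the accessor above, as it would appear inside Client.
override def getClientToken(report: ApplicationReport): String =
  Option(report.getClientToAMToken).map(_.toString).getOrElse("")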
