Reuse more code in YarnClientSchedulerBackend
We currently implement a while loop to monitor an application's state in
four separate places (stable/Client, alpha/Client, and twice in
YarnClientSchedulerBackend). This commit consolidates them into one.
andrewor14 committed Sep 9, 2014
1 parent 3f941dc commit fabe4c4
Showing 3 changed files with 72 additions and 61 deletions.
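For orientation, here is a minimal standalone sketch of the polling loop this commit consolidates into ClientBase.monitorApplication. The names and flags follow the diff below, but the report fetching and logging are simplified, so treat it as an illustration rather than the exact implementation:

import org.apache.hadoop.yarn.api.records.{ApplicationId, ApplicationReport, YarnApplicationState}

object MonitorSketch {
  // Simplified stand-in for the consolidated monitorApplication: poll the
  // ResourceManager until the application exits, or (optionally) until it is RUNNING.
  // In the real code the report comes from getApplicationReport(appId) and the
  // interval from the "spark.yarn.report.interval" setting (milliseconds).
  def monitorApplication(
      appId: ApplicationId,
      getReport: ApplicationId => ApplicationReport,
      returnOnRunning: Boolean = false,
      interval: Long = 1000): YarnApplicationState = {
    while (true) {
      Thread.sleep(interval)
      val state = getReport(appId).getYarnApplicationState
      val finished =
        state == YarnApplicationState.FINISHED ||
        state == YarnApplicationState.FAILED ||
        state == YarnApplicationState.KILLED
      if (finished || (returnOnRunning && state == YarnApplicationState.RUNNING)) {
        return state
      }
    }
    throw new IllegalStateException("unreachable, but keeps the compiler happy")
  }
}

The same loop then serves both call sites in YarnClientSchedulerBackend: one blocks until the application is RUNNING, the other blocks until it exits.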
@@ -435,36 +435,51 @@ private[spark] trait ClientBase extends Logging {

/**
* Report the state of an application until it has exited, either successfully or
* due to some failure.
* due to some failure, then return the application state.
*
* @param returnOnRunning Whether to also return the application state when it is RUNNING.
* @param logApplicationReport Whether to log details of the application report every iteration.
* @return state of the application, one of FINISHED, FAILED, KILLED, and RUNNING.
*/
def monitorApplication(appId: ApplicationId): Unit = {
def monitorApplication(
appId: ApplicationId,
returnOnRunning: Boolean = false,
logApplicationReport: Boolean = true): YarnApplicationState = {
val interval = sparkConf.getLong("spark.yarn.report.interval", 1000)
while (true) {
Thread.sleep(interval)
val report = getApplicationReport(appId)
val state = report.getYarnApplicationState

logInfo(s"Application report from ResourceManager for ${appId.getId} (state: $state)")

logDebug(
s"\t full application identifier: $appId\n" +
s"\t clientToken: ${getClientToken(report)}\n" +
s"\t appDiagnostics: ${report.getDiagnostics}\n" +
s"\t appMasterHost: ${report.getHost}\n" +
s"\t appQueue: ${report.getQueue}\n" +
s"\t appMasterRpcPort: ${report.getRpcPort}\n" +
s"\t appStartTime: ${report.getStartTime}\n" +
s"\t yarnAppState: $state\n" +
s"\t distributedFinalState: ${report.getFinalApplicationStatus}\n" +
s"\t appTrackingUrl: ${report.getTrackingUrl}\n" +
s"\t appUser: ${report.getUser}")
if (logApplicationReport) {
logInfo(s"Application report from ResourceManager for application ${appId.getId} " +
s"(state: $state)")
logDebug(
s"\t full application identifier: $appId\n" +
s"\t clientToken: ${getClientToken(report)}\n" +
s"\t appDiagnostics: ${report.getDiagnostics}\n" +
s"\t appMasterHost: ${report.getHost}\n" +
s"\t appQueue: ${report.getQueue}\n" +
s"\t appMasterRpcPort: ${report.getRpcPort}\n" +
s"\t appStartTime: ${report.getStartTime}\n" +
s"\t yarnAppState: $state\n" +
s"\t distributedFinalState: ${report.getFinalApplicationStatus}\n" +
s"\t appTrackingUrl: ${report.getTrackingUrl}\n" +
s"\t appUser: ${report.getUser}")
}

if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
return
return state
}

if (returnOnRunning && state == YarnApplicationState.RUNNING) {
return state
}
}
// Never reached, but keeps compiler happy
throw new SparkException("While loop is depleted! This should never happen...")
}

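To illustrate how the two new flags are intended to be used, here is a hedged caller-side sketch (not code from this commit); it assumes a Client built on this trait and a submitted appId, as in YarnClientSchedulerBackend below:

// Hypothetical caller-side helpers, placed in the spark package because Client
// and ClientBase are package-private (private[spark]).
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.spark.SparkException
import org.apache.spark.deploy.yarn.Client

object MonitorUsageSketch {
  // Block only until the application is RUNNING (or has already ended).
  def waitUntilRunning(client: Client, appId: ApplicationId): Unit = {
    val state = client.monitorApplication(appId, returnOnRunning = true)
    if (state != YarnApplicationState.RUNNING) {
      throw new SparkException(s"Yarn application ended before running: $state")
    }
  }

  // Block until the application exits, without logging a report each iteration.
  def blockUntilExit(client: Client, appId: ApplicationId): YarnApplicationState =
    client.monitorApplication(appId, logApplicationReport = false)
}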
@@ -19,7 +19,7 @@ package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.spark.{SparkException, Logging, SparkContext}
import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
import org.apache.spark.deploy.yarn.{Client, ClientArguments}
import org.apache.spark.scheduler.TaskSchedulerImpl

import scala.collection.mutable.ArrayBuffer
@@ -36,7 +36,6 @@ private[spark] class YarnClientSchedulerBackend(

var client: Client = null
var appId: ApplicationId = null
var checkerThread: Thread = null
var stopping: Boolean = false
var totalExpectedExecutors = 0

@@ -83,66 +82,62 @@ private[spark] class YarnClientSchedulerBackend(
totalExpectedExecutors = args.numExecutors
client = new Client(args, conf)
appId = client.submitApplication()
waitForApp()
checkerThread = yarnApplicationStateCheckerThread()
waitForApplication()
asyncMonitorApplication()
}

def waitForApp() {

// TODO : need a better way to find out whether the executors are ready or not
// maybe by resource usage report?
while(true) {
val report = client.getApplicationReport(appId)

logInfo("Application report from ASM: \n" +
"\t appMasterRpcPort: " + report.getRpcPort() + "\n" +
"\t appStartTime: " + report.getStartTime() + "\n" +
"\t yarnAppState: " + report.getYarnApplicationState() + "\n"
)

// Ready to go, or already gone.
val state = report.getYarnApplicationState()
if (state == YarnApplicationState.RUNNING) {
return
} else if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
throw new SparkException("Yarn application already ended," +
"might be killed or not able to launch application master.")
}

Thread.sleep(1000)
/**
* Report the state of the application until it is running.
* If the application has finished, failed or been killed in the process, throw an exception.
* This assumes both `client` and `appId` have already been set.
*/
private def waitForApplication(): Unit = {
assert(client != null && appId != null, "Application has not been submitted yet!")
val state = client.monitorApplication(appId, returnOnRunning = true) // blocking
if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.FAILED ||
state == YarnApplicationState.KILLED) {
throw new SparkException("Yarn application has already ended! " +
"It might have been killed or unable to launch application master.")
}
if (state == YarnApplicationState.RUNNING) {
logInfo(s"Application ${appId.getId} has started running.")
}
}

private def yarnApplicationStateCheckerThread(): Thread = {
/**
* Monitor the application state in a separate thread.
* If the application has exited for any reason, stop the SparkContext.
* This assumes both `client` and `appId` have already been set.
*/
private def asyncMonitorApplication(): Thread = {
assert(client != null && appId != null, "Application has not been submitted yet!")
val t = new Thread {
override def run() {
while (!stopping) {
val report = client.getApplicationReport(appId)
val state = report.getYarnApplicationState()
if (state == YarnApplicationState.FINISHED || state == YarnApplicationState.KILLED
|| state == YarnApplicationState.FAILED) {
logError(s"Yarn application already ended: $state")
sc.stop()
stopping = true
}
Thread.sleep(1000L)
val state = client.monitorApplication(appId, logApplicationReport = false) // blocking
if (state == YarnApplicationState.FINISHED ||
state == YarnApplicationState.KILLED ||
state == YarnApplicationState.FAILED) {
logWarning(s"Yarn application has exited: $state")
sc.stop()
stopping = true
}
checkerThread = null
Thread.currentThread().interrupt()
}
}
t.setName("Yarn Application State Checker")
t.setName("Yarn Application State Monitor")
t.setDaemon(true)
t.start()
t
}

/**
* Stop the scheduler. This assumes `start()` has already been called.
*/
override def stop() {
assert(client != null, "Attempted to stop this scheduler before starting it!")
stopping = true
super.stop()
client.stop
client.stop()
logInfo("Stopped")
}

@@ -109,6 +109,7 @@ private[spark] class Client(
yarnClient.getApplicationReport(appId)

/** */
// FIXME: This could throw NPE
override def getClientToken(report: ApplicationReport): String =
report.getClientToAMToken.toString
}
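One possible way to address the FIXME above (an illustrative sketch, not part of this commit) is to guard against a missing token with Option:

// Hypothetical null-safe variant of the accessor above, as it would appear inside Client.
override def getClientToken(report: ApplicationReport): String =
  Option(report.getClientToAMToken).map(_.toString).getOrElse("")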
