Skip to content

Commit

Permalink
SPARK-1689 AppClient should indicate app is dead() when removed
Browse files Browse the repository at this point in the history
Previously, we indicated disconnected(), which keeps the application in a limbo state where it has no executors but thinks it will get them soon.

This is a bug fix that hopefully can be included in 1.0.

Author: Aaron Davidson <[email protected]>

Closes #605 from aarondav/appremoved and squashes the following commits:

bea02a2 [Aaron Davidson] SPARK-1689 AppClient should indicate app is dead() when removed
  • Loading branch information
aarondav authored and pwendell committed May 3, 2014
1 parent ce72c72 commit 34719ba
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ private[spark] class AppClient(
if (registered) {
retryTimer.cancel()
} else if (retries >= REGISTRATION_RETRIES) {
logError("All masters are unresponsive! Giving up.")
markDead()
markDead("All masters are unresponsive! Giving up.")
} else {
tryRegisterAllMasters()
}
Expand Down Expand Up @@ -126,8 +125,7 @@ private[spark] class AppClient(
listener.connected(appId)

case ApplicationRemoved(message) =>
logError("Master removed our application: %s; stopping client".format(message))
markDisconnected()
markDead("Master removed our application: %s".format(message))
context.stop(self)

case ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) =>
Expand Down Expand Up @@ -158,7 +156,7 @@ private[spark] class AppClient(
logWarning(s"Could not connect to $address: $cause")

case StopAppClient =>
markDead()
markDead("Application has been stopped.")
sender ! true
context.stop(self)
}
Expand All @@ -173,9 +171,9 @@ private[spark] class AppClient(
}
}

def markDead() {
def markDead(reason: String) {
if (!alreadyDead) {
listener.dead()
listener.dead(reason)
alreadyDead = true
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ private[spark] trait AppClientListener {
/** Disconnection may be a temporary state, as we fail over to a new Master. */
def disconnected(): Unit

/** Dead means that we couldn't find any Masters to connect to, and have given up. */
def dead(): Unit
/** An application death is an unrecoverable failure condition. */
def dead(reason: String): Unit

def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ private[spark] object TestClient {
System.exit(0)
}

def dead() {
logInfo("Could not connect to master")
def dead(reason: String) {
logInfo("Application died with error: " + reason)
System.exit(0)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ private[spark] class SparkDeploySchedulerBackend(
}
}

override def dead() {
override def dead(reason: String) {
if (!stopping) {
logError("Spark cluster looks dead, giving up.")
scheduler.error("Spark cluster looks down")
logError("Application has been killed. Reason: " + reason)
scheduler.error(reason)
}
}

Expand Down

0 comments on commit 34719ba

Please sign in to comment.