E2E: clarify drain -deadline and -force flag behaviors (#16868)
The `-deadline` and `-force` flags for the `nomad node drain` command only cause
the draining to ignore the `migrate` block's `healthy_deadline`, `max_parallel`,
etc. These flags don't have anything to do with the `kill_timeout` or
`shutdown_delay` options of the jobspec.

This changeset fixes the skipped E2E tests so that they validate the intended
behavior, and updates the docs for more clarity.
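
To make the flag semantics concrete, here is a minimal sketch (not part of this
commit) of the same two drains driven through the Go API client. It assumes the
mapping the CLI uses, where `-deadline` populates `DrainSpec.Deadline` and
`-force` is expressed as a negative deadline; the node ID is hypothetical.

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// hypothetical node ID, for illustration only
	nodeID := "f3b2c1d0-aaaa-bbbb-cccc-0123456789ab"

	// Like `nomad node drain -enable -deadline 1m <node>`: once the deadline
	// passes, the migrate block's max_parallel / healthy_deadline pacing is
	// ignored and remaining allocs are stopped; kill_timeout and
	// shutdown_delay still apply to the tasks themselves.
	spec := &api.DrainSpec{Deadline: 1 * time.Minute}
	resp, err := client.Nodes().UpdateDrain(nodeID, spec, false, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("drain registered at index", resp.LastIndex)

	// Like `nomad node drain -enable -force <node>`: a negative deadline
	// skips the migrate pacing entirely and stops all allocs immediately.
	force := &api.DrainSpec{Deadline: -1 * time.Second}
	if _, err := client.Nodes().UpdateDrain(nodeID, force, false, nil); err != nil {
		log.Fatal(err)
	}
}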
tgross committed Apr 12, 2023
1 parent da368d3 commit f26cf29
Showing 5 changed files with 166 additions and 227 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e_test.go
@@ -22,7 +22,6 @@ import (
_ "github.com/hashicorp/nomad/e2e/metrics"
_ "github.com/hashicorp/nomad/e2e/namespaces"
_ "github.com/hashicorp/nomad/e2e/networking"
_ "github.com/hashicorp/nomad/e2e/nodedrain"
_ "github.com/hashicorp/nomad/e2e/nomadexec"
_ "github.com/hashicorp/nomad/e2e/oversubscription"
_ "github.com/hashicorp/nomad/e2e/parameterized"
@@ -42,6 +41,7 @@ import (
// these are no longer on the old framework but by importing them
// we get a quick check that they compile on every commit
_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
_ "github.com/hashicorp/nomad/e2e/nodedrain"
_ "github.com/hashicorp/nomad/e2e/volumes"
)

25 changes: 9 additions & 16 deletions e2e/nodedrain/input/drain_deadline.nomad
@@ -1,38 +1,31 @@
job "drain_deadline" {
datacenters = ["dc1", "dc2"]

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

migrate {
max_parallel = 1
min_healthy_time = "30s"
}

group "group" {

count = 2

task "task" {
driver = "docker"

kill_timeout = "2m"

config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/script.sh"]
}

template {
data = <<EOF
#!/bin/sh
trap 'sleep 60' 2
sleep 600
EOF

destination = "local/script.sh"
change_mode = "noop"
args = ["-c", "sleep 600"]
}

resources {
cpu = 256
memory = 128
memory = 64
}
}
}
174 changes: 149 additions & 25 deletions e2e/nodedrain/node_drain_test.go
@@ -27,6 +27,8 @@ func TestNodeDrain(t *testing.T) {
t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("EphemeralMigrate", testEphemeralMigrate)
t.Run("KeepIneligible", testKeepIneligible)
t.Run("DeadlineFlag", testDeadlineFlag)
t.Run("ForceFlag", testForceFlag)
}

// testIgnoreSystem tests that system jobs are left behind when the
@@ -48,12 +50,15 @@ func testIgnoreSystem(t *testing.T) {
// Run a system job, which will not be moved when we drain the node
systemJobID := "test-node-drain-system-" + uuid.Short()
t.Cleanup(cleanupJobState(t, systemJobID))
registerAndWaitForRunning(t, nomadClient, systemJobID, "./input/drain_ignore_system.nomad", count)

must.NoError(t, e2eutil.Register(systemJobID, "./input/drain_ignore_system.nomad"))
waitForRunningAllocs(t, nomadClient, systemJobID, count)

// Also run a service job so we can verify when the drain is done
serviceJobID := "test-node-drain-service-" + uuid.Short()
t.Cleanup(cleanupJobState(t, serviceJobID))
serviceAllocs := registerAndWaitForRunning(t, nomadClient, serviceJobID, "./input/drain_simple.nomad", 1)
must.NoError(t, e2eutil.Register(serviceJobID, "./input/drain_simple.nomad"))
serviceAllocs := waitForRunningAllocs(t, nomadClient, serviceJobID, 1)
oldAllocID := serviceAllocs[0].ID
oldNodeID := serviceAllocs[0].NodeID

@@ -64,7 +69,8 @@ func testIgnoreSystem(t *testing.T) {
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

// The service job should be drained
newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
newAllocs := waitForAllocDrainComplete(t, nomadClient, serviceJobID,
oldAllocID, oldNodeID, time.Second*120)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))

// The system job should not have been drained
@@ -89,7 +95,8 @@ func testEphemeralMigrate(t *testing.T) {
nomadClient := e2eutil.NomadClient(t)
jobID := "drain-migrate-" + uuid.Short()

allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_migrate.nomad", 1)
must.NoError(t, e2eutil.Register(jobID, "./input/drain_migrate.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)
t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID
@@ -114,7 +121,8 @@ func testEphemeralMigrate(t *testing.T) {
out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*120)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
newAllocID := newAllocs[0].ID
newNodeID := newAllocs[0].NodeID
@@ -173,42 +181,129 @@ func testKeepIneligible(t *testing.T) {
}
}

// registerAndWaitForRunning registers a job and waits for the expected number
// of allocations to be in a running state. Returns the allocations.
func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
// testDeadlineFlag tests the enforcement of the node drain deadline so that
// allocations are moved even if max_parallel says we should be waiting
func testDeadlineFlag(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Short()
must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)

t.Cleanup(cleanupJobState(t, jobID))
oldAllocID1 := allocs[0].ID
oldNodeID1 := allocs[0].NodeID
oldAllocID2 := allocs[1].ID
oldNodeID2 := allocs[1].NodeID

t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))

out, err = e2eutil.Command(
"nomad", "node", "drain",
"-deadline", "1s",
"-enable", "-yes", "-detach", oldNodeID1)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))

out, err = e2eutil.Command(
"nomad", "node", "drain",
"-deadline", "1s",
"-enable", "-yes", "-detach", oldNodeID2)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))

// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60s
// for both to be marked complete. Instead, because of the -deadline flag
// we expect the allocs to be stopped almost immediately (give it 10s to
// avoid flakiness), and then the new allocs should come up and get marked
// healthy after ~30s
t.Log("waiting for old allocs to stop")
waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)

t.Log("waiting for running allocs")
waitForRunningAllocs(t, nomadClient, jobID, 2)
}

// testForceFlag tests the enforcement of the node drain -force flag so that
// allocations are terminated immediately.
func testForceFlag(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Short()
must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)

t.Cleanup(cleanupJobState(t, jobID))
oldAllocID1 := allocs[0].ID
oldNodeID1 := allocs[0].NodeID
oldAllocID2 := allocs[1].ID
oldNodeID2 := allocs[1].NodeID

t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))

out, err = e2eutil.Command(
"nomad", "node", "drain", "-force",
"-enable", "-yes", "-detach", oldNodeID1)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))

out, err = e2eutil.Command(
"nomad", "node", "drain", "-force",
"-enable", "-yes", "-detach", oldNodeID2)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))

// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60s
// for both to be marked complete. Instead, because of the -force flag
// we expect the allocs to be stopped almost immediately (give it 10s to
// avoid flakiness), and then the new allocs should come up and get marked
// healthy after ~30s
t.Log("waiting for old allocs to stop")
waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)

t.Log("waiting for running allocs")
waitForRunningAllocs(t, nomadClient, jobID, 2)
}

func waitForRunningAllocs(t *testing.T, nomadClient *api.Client, jobID string, expectedRunningCount int) []*api.AllocationListStub {
t.Helper()

var allocs []*api.AllocationListStub
var err error
must.NoError(t, e2eutil.Register(jobID, jobSpec))
runningAllocs := set.From([]*api.AllocationListStub{})

must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
if err != nil {
return fmt.Errorf("expected no error listing allocs: %v", err)
}
if len(allocs) != expectedCount {
return fmt.Errorf("expected %d allocs but found %d", expectedCount, len(allocs))
}
allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
must.NoError(t, err)
// reset the set on each attempt so allocs seen during a failed poll
// don't accumulate as duplicate entries
runningAllocs = set.From([]*api.AllocationListStub{})
for _, alloc := range allocs {
if alloc.ClientStatus != structs.AllocClientStatusRunning {
return fmt.Errorf("alloc %q was %q, not running", alloc.ID, alloc.ClientStatus)
if alloc.ClientStatus == structs.AllocClientStatusRunning {
runningAllocs.Insert(alloc)
}
}
if runningAllocs.Size() < expectedRunningCount {
return fmt.Errorf("expected %d running allocs, got %d", expectedRunningCount, count)
}
return nil
}),
wait.Timeout(60*time.Second),
wait.Gap(500*time.Millisecond),
))
return allocs
return runningAllocs.Slice()
}

// waitForAllocDrain polls the allocation statues for a job until we've finished
// waitForAllocDrainComplete polls the allocation statuses for a job until we've finished
// migrating:
// - the old alloc should be stopped
// - the new alloc should be running
func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
func waitForAllocDrainComplete(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string, deadline time.Duration) []*api.AllocationListStub {

t.Helper()
newAllocs := set.From([]*api.AllocationListStub{})
@@ -240,16 +335,45 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
oldNodeID[:8], time.Now().Sub(start))
return nil
}),
wait.Timeout(120*time.Second),
wait.Timeout(deadline),
wait.Gap(500*time.Millisecond),
))

return newAllocs.Slice()
}

// waitForAllocsStop polls the allocation statuses for specific allocations
// until they've stopped
func waitForAllocsStop(t *testing.T, nomadClient *api.Client, deadline time.Duration, oldAllocIDs ...string) {
t.Helper()

must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
for _, allocID := range oldAllocIDs {
alloc, _, err := nomadClient.Allocations().Info(allocID, nil)
must.NoError(t, err)
if alloc.ClientStatus != structs.AllocClientStatusComplete {
return fmt.Errorf("expected alloc %s to be complete, got %q",
allocID[:8], alloc.ClientStatus)
}
}
return nil
}),
wait.Timeout(deadline),
wait.Gap(500*time.Millisecond),
))
}

func cleanupJobState(t *testing.T, jobID string) func() {
return func() {
_, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
return
}

// we can't use the CLI here because some tests will stop the job during
// a running deployment, which returns a non-zero exit code
nomadClient := e2eutil.NomadClient(t)
_, _, err := nomadClient.Jobs().Deregister(jobID, true, nil)
test.NoError(t, err)
}
}
