From 3aad802f28f056adfa92284a104238200972e472 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Fri, 7 Apr 2023 09:17:19 -0400
Subject: [PATCH 1/3] E2E: update subset of node drain tests off the old
 framework (#16823)

While working on several open drain issues, I'm fixing up the E2E tests. This
subset of tests being refactored are existing ones that already work. I'm
shipping these as their own PR to keep review sizes manageable when I push up
PRs in the next few days for #9902, #12314, and #12915.
---
 e2e/nodedrain/doc.go                          |   4 +
 e2e/nodedrain/input/drain_ignore_system.nomad |   1 -
 e2e/nodedrain/input/drain_simple.nomad        |   1 -
 e2e/nodedrain/node_drain_test.go              | 208 ++++++++++++++++++
 e2e/nodedrain/nodedrain.go                    | 106 ---------
 5 files changed, 212 insertions(+), 108 deletions(-)
 create mode 100644 e2e/nodedrain/doc.go
 create mode 100644 e2e/nodedrain/node_drain_test.go

diff --git a/e2e/nodedrain/doc.go b/e2e/nodedrain/doc.go
new file mode 100644
index 00000000000..7709145580c
--- /dev/null
+++ b/e2e/nodedrain/doc.go
@@ -0,0 +1,4 @@
+package nodedrain
+
+// This package contains only tests, so this is a placeholder file to
+// make sure builds don't fail with "no non-test Go files in" errors
diff --git a/e2e/nodedrain/input/drain_ignore_system.nomad b/e2e/nodedrain/input/drain_ignore_system.nomad
index 95b5f0b2fbd..f960da8ad50 100644
--- a/e2e/nodedrain/input/drain_ignore_system.nomad
+++ b/e2e/nodedrain/input/drain_ignore_system.nomad
@@ -1,5 +1,4 @@
 job "drain_ignore_system_service" {
-  datacenters = ["dc1", "dc2"]
 
   type = "system"
 
diff --git a/e2e/nodedrain/input/drain_simple.nomad b/e2e/nodedrain/input/drain_simple.nomad
index e412323a659..0ce14efe598 100644
--- a/e2e/nodedrain/input/drain_simple.nomad
+++ b/e2e/nodedrain/input/drain_simple.nomad
@@ -1,5 +1,4 @@
 job "drain_simple" {
-  datacenters = ["dc1", "dc2"]
 
   constraint {
     attribute = "${attr.kernel.name}"
diff --git a/e2e/nodedrain/node_drain_test.go b/e2e/nodedrain/node_drain_test.go
new file mode 100644
index 00000000000..e4fcdfbfcf7
--- /dev/null
+++ b/e2e/nodedrain/node_drain_test.go
@@ -0,0 +1,208 @@
+package nodedrain
+
+import (
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/hashicorp/go-set"
+	"github.com/shoenig/test"
+	"github.com/shoenig/test/must"
+	"github.com/shoenig/test/wait"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/nomad/structs"
+)
+
+func TestNodeDrain(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	e2eutil.WaitForLeader(t, nomadClient)
+	e2eutil.WaitForNodesReady(t, nomadClient, 2) // needs at least 2 to test migration
+
+	t.Run("IgnoreSystem", testIgnoreSystem)
+	t.Run("KeepIneligible", testKeepIneligible)
+}
+
+// testIgnoreSystem tests that system jobs are left behind when the
+// -ignore-system flag is used.
+func testIgnoreSystem(t *testing.T) {
+
+	t.Cleanup(cleanupDrainState(t))
+	nomadClient := e2eutil.NomadClient(t)
+
+	// Figure out how many system alloc we'll expect to see
+	nodes, err := e2eutil.NodeStatusListFiltered(
+		func(section string) bool {
+			kernelName, err := e2eutil.GetField(section, "kernel.name")
+			return err == nil && kernelName == "linux"
+		})
+	must.NoError(t, err, must.Sprint("could not get node status listing"))
+	count := len(nodes)
+
+	// Run a system job, which will not be moved when we drain the node
+	systemJobID := "test-node-drain-system-" + uuid.Short()
+	t.Cleanup(cleanupJobState(t, systemJobID))
+	registerAndWaitForRunning(t, nomadClient, systemJobID, "./input/drain_ignore_system.nomad", count)
+
+	// Also run a service job so we can verify when the drain is done
+	serviceJobID := "test-node-drain-service-" + uuid.Short()
+	t.Cleanup(cleanupJobState(t, serviceJobID))
+	serviceAllocs := registerAndWaitForRunning(t, nomadClient, serviceJobID, "./input/drain_simple.nomad", 1)
+	oldAllocID := serviceAllocs[0].ID
+	oldNodeID := serviceAllocs[0].NodeID
+
+	// Drain the node with -ignore-system
+	out, err := e2eutil.Command(
+		"nomad", "node", "drain",
+		"-ignore-system", "-enable", "-yes", "-detach", oldNodeID)
+	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
+
+	// The service job should be drained
+	newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
+	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))
+
+	// The system job should not have been drained
+	got, err := e2eutil.AllocsForJob(systemJobID, structs.DefaultNamespace)
+	must.NoError(t, err, must.Sprintf("could not read allocs for system job: %v", got))
+	must.Len(t, count, got, must.Sprintf("expected %d system allocs", count))
+
+	for _, systemAlloc := range got {
+		must.Eq(t, "running", systemAlloc["Status"],
+			must.Sprint("expected all system allocs to be left client=running"))
+		must.Eq(t, "run", systemAlloc["Desired"],
+			must.Sprint("expected all system allocs to be left desired=run"))
+	}
+}
+
+// testKeepIneligible tests that nodes can be kept ineligible for scheduling after
+// disabling drain.
+func testKeepIneligible(t *testing.T) {
+
+	nodes, err := e2eutil.NodeStatusList()
+	must.NoError(t, err, must.Sprint("expected no error when listing nodes"))
+
+	nodeID := nodes[0]["ID"]
+
+	t.Cleanup(cleanupDrainState(t))
+
+	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
+	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain",
+		"-disable", "-keep-ineligible", "-yes", nodeID)
+	must.NoError(t, err, must.Sprintf("expected no error when disabling drain for node: %v", out))
+
+	nodes, err = e2eutil.NodeStatusList()
+	must.NoError(t, err, must.Sprint("expected no error when listing nodes"))
+
+	for _, node := range nodes {
+		if node["ID"] == nodeID {
+			must.Eq(t, "ineligible", nodes[0]["Eligibility"])
+			must.Eq(t, "false", nodes[0]["Drain"])
+		}
+	}
+}
+
+// registerAndWaitForRunning registers a job and waits for the expected number
+// of allocations to be in a running state. Returns the allocations.
+func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
+	t.Helper()
+
+	var allocs []*api.AllocationListStub
+	var err error
+	must.NoError(t, e2eutil.Register(jobID, jobSpec))
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
+			if err != nil {
+				return fmt.Errorf("expected no error listing allocs: %v", err)
+			}
+			if len(allocs) != expectedCount {
+				return fmt.Errorf("expected %d allocs but found %d", expectedCount, len(allocs))
+			}
+			for _, alloc := range allocs {
+				if alloc.ClientStatus != structs.AllocClientStatusRunning {
+					return fmt.Errorf("alloc %q was %q, not running", alloc.ID, alloc.ClientStatus)
+				}
+			}
+			return nil
+		}),
+		wait.Timeout(60*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+	return allocs
+}
+
+// waitForAllocDrain polls the allocation statues for a job until we've finished
+// migrating:
+// - the old alloc should be stopped
+// - the new alloc should be running
+func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
+
+	t.Helper()
+	newAllocs := set.From([]*api.AllocationListStub{})
+	start := time.Now()
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
+			if err != nil {
+				return fmt.Errorf("could not read allocations for node: %w", err)
+			}
+			if len(allocs) == 1 {
+				return fmt.Errorf("no new alloc started")
+			}
+
+			for _, alloc := range allocs {
+				if alloc.ID == oldAllocID {
+					if alloc.ClientStatus != structs.AllocClientStatusComplete {
+						return fmt.Errorf("old alloc was not marked complete")
+					}
+				} else {
+					if alloc.ClientStatus != structs.AllocClientStatusRunning {
+						return fmt.Errorf("new alloc was not marked running")
+					}
+					newAllocs.Insert(alloc)
+				}
+			}
+			t.Logf("alloc has drained from node=%s after %v",
+				oldNodeID, time.Now().Sub(start))
+			return nil
+		}),
+		wait.Timeout(120*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+
+	return newAllocs.Slice()
+}
+
+func cleanupJobState(t *testing.T, jobID string) func() {
+	return func() {
+		_, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
+		test.NoError(t, err)
+	}
+}
+
+func cleanupDrainState(t *testing.T) func() {
+	return func() {
+		if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+			return
+		}
+
+		nomadClient := e2eutil.NomadClient(t)
+		nodes, _, err := nomadClient.Nodes().List(nil)
+		must.NoError(t, err, must.Sprint("expected no error when listing nodes"))
+		for _, node := range nodes {
+			_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", node.ID)
+			test.NoError(t, err)
+			_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", node.ID)
+			test.NoError(t, err)
+		}
+	}
+}
diff --git a/e2e/nodedrain/nodedrain.go b/e2e/nodedrain/nodedrain.go
index bd9cbb154a3..f45971fa4fe 100644
--- a/e2e/nodedrain/nodedrain.go
+++ b/e2e/nodedrain/nodedrain.go
@@ -162,87 +162,6 @@ func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
 	f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
 }
 
-// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
-// -ignore-system flag is used.
-func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {
-
-	nodes, err := e2e.NodeStatusListFiltered(
-		func(section string) bool {
-			kernelName, err := e2e.GetField(section, "kernel.name")
-			return err == nil && kernelName == "linux"
-		})
-	f.NoError(err, "could not get node status listing")
-
-	serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
-	systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]
-
-	f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
-	tc.jobIDs = append(tc.jobIDs, serviceJobID)
-
-	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, []string{"running"}))
-
-	allocs, err := e2e.AllocsForJob(serviceJobID, ns)
-	f.NoError(err, "could not get allocs for service job")
-	f.Len(allocs, 1, "could not get allocs for service job")
-	oldAllocID := allocs[0]["ID"]
-
-	f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
-	tc.jobIDs = append(tc.jobIDs, systemJobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected),
-		"service job should be running")
-
-	// can't just give it a static list because the number of nodes can vary
-	f.NoError(
-		e2e.WaitForAllocStatusComparison(
-			func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) },
-			func(got []string) bool {
-				if len(got) != len(nodes) {
-					return false
-				}
-				for _, status := range got {
-					if status != "running" {
-						return false
-					}
-				}
-				return true
-			}, nil,
-		),
-		"system job should be running on every node",
-	)
-
-	jobNodes, err := nodesForJob(serviceJobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(jobNodes, 1, "could not get nodes for job")
-	nodeID := jobNodes[0]
-
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-ignore-system", "-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			for _, alloc := range got {
-				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
-	), "node did not drain")
-
-	allocs, err = e2e.AllocsForJob(systemJobID, ns)
-	f.NoError(err, "could not query allocs for system job")
-	f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
-	for _, alloc := range allocs {
-		f.Equal("run", alloc["Desired"], "no system allocs should be draining")
-		f.Equal("running", alloc["Status"], "no system allocs should be draining")
-	}
-}
-
 // TestNodeDrainDeadline tests the enforcement of the node drain deadline so
 // that allocations are terminated even if they haven't gracefully exited.
 func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
@@ -327,28 +246,3 @@ func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
 	), "node did not drain immediately when forced")
 
 }
-
-// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
-// scheduling after disabling drain.
-func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {
-
-	nodes, err := e2e.NodeStatusList()
-	f.NoError(err, "could not get node status listing")
-
-	nodeID := nodes[0]["ID"]
-
-	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	_, err = e2e.Command(
-		"nomad", "node", "drain",
-		"-disable", "-keep-ineligible", "-yes", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-
-	nodes, err = e2e.NodeStatusList()
-	f.NoError(err, "could not get updated node status listing")
-
-	f.Equal("ineligible", nodes[0]["Eligibility"])
-	f.Equal("false", nodes[0]["Drain"])
-}

From da368d3cd9f2e5c7f571fcacb6535917618bb6a9 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Wed, 12 Apr 2023 15:58:09 -0400
Subject: [PATCH 2/3] backport of #16826 E2E test changes only

---
 e2e/nodedrain/input/drain_migrate.nomad | 18 +------
 e2e/nodedrain/node_drain_test.go        | 67 +++++++++++++++++++++++-
 e2e/nodedrain/nodedrain.go              | 68 -------------------------
 3 files changed, 67 insertions(+), 86 deletions(-)

diff --git a/e2e/nodedrain/input/drain_migrate.nomad b/e2e/nodedrain/input/drain_migrate.nomad
index 4c67cc8b3b4..27949026dd0 100644
--- a/e2e/nodedrain/input/drain_migrate.nomad
+++ b/e2e/nodedrain/input/drain_migrate.nomad
@@ -1,5 +1,4 @@
 job "drain_migrate" {
-  datacenters = ["dc1", "dc2"]
 
   constraint {
     attribute = "${attr.kernel.name}"
@@ -19,22 +18,7 @@ job "drain_migrate" {
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["local/test.sh"]
-      }
-
-      template {
-        data = <<EOT
-#!/bin/sh
-if [ ! -f /alloc/data/{{ env "NOMAD_JOB_NAME" }} ]; then
-  echo writing {{ env "NOMAD_ALLOC_ID" }} to /alloc/data/{{ env "NOMAD_JOB_NAME" }}
-  echo {{ env "NOMAD_ALLOC_ID" }} > /alloc/data/{{ env "NOMAD_JOB_NAME" }}
-else
-   echo /alloc/data/{{ env "NOMAD_JOB_NAME" }} already exists
-fi
-sleep 3600
-EOT
-
-        destination = "local/test.sh"
+        args    = ["-c", "echo \"data from $NOMAD_ALLOC_ID\" >> /alloc/data/migrate.txt && sleep 120"]
       }
 
       resources {
diff --git a/e2e/nodedrain/node_drain_test.go b/e2e/nodedrain/node_drain_test.go
index e4fcdfbfcf7..7ee3a274b11 100644
--- a/e2e/nodedrain/node_drain_test.go
+++ b/e2e/nodedrain/node_drain_test.go
@@ -3,6 +3,7 @@ package nodedrain
 import (
 	"fmt"
 	"os"
+	"strings"
 	"testing"
 	"time"
 
@@ -24,6 +25,7 @@ func TestNodeDrain(t *testing.T) {
 	e2eutil.WaitForNodesReady(t, nomadClient, 2) // needs at least 2 to test migration
 
 	t.Run("IgnoreSystem", testIgnoreSystem)
+	t.Run("EphemeralMigrate", testEphemeralMigrate)
 	t.Run("KeepIneligible", testKeepIneligible)
 }
 
@@ -78,6 +80,69 @@ func testIgnoreSystem(t *testing.T) {
 	}
 }
 
+// testEphemeralMigrate tests that ephermeral_disk migrations work as expected
+// even during a node drain.
+func testEphemeralMigrate(t *testing.T) {
+
+	t.Cleanup(cleanupDrainState(t))
+
+	nomadClient := e2eutil.NomadClient(t)
+	jobID := "drain-migrate-" + uuid.Short()
+
+	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_migrate.nomad", 1)
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID := allocs[0].ID
+	oldNodeID := allocs[0].NodeID
+
+	// make sure the allocation has written its ID to disk so we have something to migrate
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			got, err := e2eutil.Command("nomad", "alloc", "fs", oldAllocID,
+				"alloc/data/migrate.txt")
+			if err != nil {
+				return fmt.Errorf("did not expect error reading alloc fs: %v", err)
+			}
+			if !strings.Contains(got, oldAllocID) {
+				return fmt.Errorf("expected data to be written for alloc %q", oldAllocID)
+			}
+			return nil
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+
+	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
+	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
+
+	newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
+	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
+	newAllocID := newAllocs[0].ID
+	newNodeID := newAllocs[0].NodeID
+
+	// although migrate=true implies sticky=true, the drained node is ineligible
+	// for scheduling so the alloc should have been migrated
+	must.NotEq(t, oldNodeID, newNodeID, must.Sprint("new alloc was placed on draining node"))
+
+	// once the new allocation is running, it should quickly have the right data
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			got, err := e2eutil.Command("nomad", "alloc", "fs", newAllocID,
+				"alloc/data/migrate.txt")
+			if err != nil {
+				return fmt.Errorf("did not expect error reading alloc fs: %v", err)
+			}
+			if !strings.Contains(got, oldAllocID) || !strings.Contains(got, newAllocID) {
+				return fmt.Errorf(
+					"expected data to be migrated from alloc=%s on node=%s to alloc=%s on node=%s but got:\n%q",
+					oldAllocID[:8], oldNodeID[:8], newAllocID[:8], newNodeID[:8], got)
+			}
+			return nil
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+}
+
 // testKeepIneligible tests that nodes can be kept ineligible for scheduling after
 // disabling drain.
 func testKeepIneligible(t *testing.T) {
@@ -172,7 +237,7 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
 				}
 			}
 			t.Logf("alloc has drained from node=%s after %v",
-				oldNodeID, time.Now().Sub(start))
+				oldNodeID[:8], time.Now().Sub(start))
 			return nil
 		}),
 		wait.Timeout(120*time.Second),
diff --git a/e2e/nodedrain/nodedrain.go b/e2e/nodedrain/nodedrain.go
index f45971fa4fe..75d9697e1da 100644
--- a/e2e/nodedrain/nodedrain.go
+++ b/e2e/nodedrain/nodedrain.go
@@ -3,7 +3,6 @@ package nodedrain
 import (
 	"fmt"
 	"os"
-	"strings"
 	"time"
 
 	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
@@ -95,73 +94,6 @@ func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool,
 	return err
 }
 
-// TestNodeDrainEphemeralMigrate tests that ephermeral_disk migrations work as
-// expected even during a node drain.
-func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	allocs, err := e2e.AllocsForJob(jobID, ns)
-	f.NoError(err, "could not get allocs for job")
-	f.Len(allocs, 1, "could not get allocs for job")
-	oldAllocID := allocs[0]["ID"]
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			for _, alloc := range got {
-				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
-	), "node did not drain")
-
-	// wait for the allocation to be migrated
-	expected = []string{"running", "complete"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	allocs, err = e2e.AllocsForJob(jobID, ns)
-	f.NoError(err, "could not get allocations for job")
-
-	// the task writes its alloc ID to a file if it hasn't been previously
-	// written, so find the contents of the migrated file and make sure they
-	// match the old allocation, not the running one
-	var got string
-	var fsErr error
-	testutil.WaitForResultRetries(10, func() (bool, error) {
-		time.Sleep(time.Millisecond * 100)
-		for _, alloc := range allocs {
-			if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
-				got, fsErr = e2e.Command("nomad", "alloc", "fs",
-					alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
-				if err != nil {
-					return false, err
-				}
-				return true, nil
-			}
-		}
-		return false, fmt.Errorf("missing expected allocation")
-	}, func(e error) {
-		fsErr = e
-	})
-	f.NoError(fsErr, "could not get allocation data")
-	f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
-}
-
 // TestNodeDrainDeadline tests the enforcement of the node drain deadline so
 // that allocations are terminated even if they haven't gracefully exited.
 func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {

From f26cf29d3aaa8f19382b28fc2eb23a496098d2ff Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Wed, 12 Apr 2023 15:59:31 -0400
Subject: [PATCH 3/3] E2E: clarify drain -deadline and -force flag behaviors
 (#16868)

The `-deadline` and `-force` flag for the `nomad node drain` command only cause
the draining to ignore the `migrate` block's healthy deadline, max parallel,
etc. These flags don't have anything to do with the `kill_timeout` or
`shutdown_delay` options of the jobspec.

This changeset fixes the skipped E2E tests so that they validate the intended
behavior, and updates the docs for more clarity.
---
 e2e/e2e_test.go                              |   2 +-
 e2e/nodedrain/input/drain_deadline.nomad     |  25 +--
 e2e/nodedrain/node_drain_test.go             | 174 +++++++++++++++---
 e2e/nodedrain/nodedrain.go                   | 180 -------------------
 website/content/docs/commands/node/drain.mdx |  12 +-
 5 files changed, 166 insertions(+), 227 deletions(-)
 delete mode 100644 e2e/nodedrain/nodedrain.go

diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
index 5aeb8492d93..8c2907b8ad8 100644
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@@ -22,7 +22,6 @@ import (
 	_ "github.com/hashicorp/nomad/e2e/metrics"
 	_ "github.com/hashicorp/nomad/e2e/namespaces"
 	_ "github.com/hashicorp/nomad/e2e/networking"
-	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/nomadexec"
 	_ "github.com/hashicorp/nomad/e2e/oversubscription"
 	_ "github.com/hashicorp/nomad/e2e/parameterized"
@@ -42,6 +41,7 @@ import (
 	// these are no longer on the old framework but by importing them
 	// we get a quick check that they compile on every commit
 	_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
+	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/volumes"
 )
 
diff --git a/e2e/nodedrain/input/drain_deadline.nomad b/e2e/nodedrain/input/drain_deadline.nomad
index d869234481c..e91fc197a5e 100644
--- a/e2e/nodedrain/input/drain_deadline.nomad
+++ b/e2e/nodedrain/input/drain_deadline.nomad
@@ -1,38 +1,31 @@
 job "drain_deadline" {
-  datacenters = ["dc1", "dc2"]
 
   constraint {
     attribute = "${attr.kernel.name}"
     value     = "linux"
   }
 
+  migrate {
+    max_parallel     = 1
+    min_healthy_time = "30s"
+  }
+
   group "group" {
 
+    count = 2
+
     task "task" {
       driver = "docker"
 
-      kill_timeout = "2m"
-
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["local/script.sh"]
-      }
-
-      template {
-        data = <<EOF
-#!/bin/sh
-trap 'sleep 60' 2
-sleep 600
-EOF
-
-        destination = "local/script.sh"
-        change_mode = "noop"
+        args    = ["-c", "sleep 600"]
       }
 
       resources {
         cpu    = 256
-        memory = 128
+        memory = 64
       }
     }
   }
diff --git a/e2e/nodedrain/node_drain_test.go b/e2e/nodedrain/node_drain_test.go
index 7ee3a274b11..43151e8b4db 100644
--- a/e2e/nodedrain/node_drain_test.go
+++ b/e2e/nodedrain/node_drain_test.go
@@ -27,6 +27,8 @@ func TestNodeDrain(t *testing.T) {
 	t.Run("IgnoreSystem", testIgnoreSystem)
 	t.Run("EphemeralMigrate", testEphemeralMigrate)
 	t.Run("KeepIneligible", testKeepIneligible)
+	t.Run("DeadlineFlag", testDeadlineFlag)
+	t.Run("ForceFlag", testForceFlag)
 }
 
 // testIgnoreSystem tests that system jobs are left behind when the
@@ -48,12 +50,15 @@ func testIgnoreSystem(t *testing.T) {
 	// Run a system job, which will not be moved when we drain the node
 	systemJobID := "test-node-drain-system-" + uuid.Short()
 	t.Cleanup(cleanupJobState(t, systemJobID))
-	registerAndWaitForRunning(t, nomadClient, systemJobID, "./input/drain_ignore_system.nomad", count)
+
+	must.NoError(t, e2eutil.Register(systemJobID, "./input/drain_ignore_system.nomad"))
+	waitForRunningAllocs(t, nomadClient, systemJobID, count)
 
 	// Also run a service job so we can verify when the drain is done
 	serviceJobID := "test-node-drain-service-" + uuid.Short()
 	t.Cleanup(cleanupJobState(t, serviceJobID))
-	serviceAllocs := registerAndWaitForRunning(t, nomadClient, serviceJobID, "./input/drain_simple.nomad", 1)
+	must.NoError(t, e2eutil.Register(serviceJobID, "./input/drain_simple.nomad"))
+	serviceAllocs := waitForRunningAllocs(t, nomadClient, serviceJobID, 1)
 	oldAllocID := serviceAllocs[0].ID
 	oldNodeID := serviceAllocs[0].NodeID
 
@@ -64,7 +69,8 @@ func testIgnoreSystem(t *testing.T) {
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
 	// The service job should be drained
-	newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrainComplete(t, nomadClient, serviceJobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))
 
 	// The system job should not have been drained
@@ -89,7 +95,8 @@ func testEphemeralMigrate(t *testing.T) {
 	nomadClient := e2eutil.NomadClient(t)
 	jobID := "drain-migrate-" + uuid.Short()
 
-	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_migrate.nomad", 1)
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_migrate.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)
 	t.Cleanup(cleanupJobState(t, jobID))
 	oldAllocID := allocs[0].ID
 	oldNodeID := allocs[0].NodeID
@@ -114,7 +121,8 @@ func testEphemeralMigrate(t *testing.T) {
 	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
-	newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
 	newAllocID := newAllocs[0].ID
 	newNodeID := newAllocs[0].NodeID
@@ -173,42 +181,129 @@ func testKeepIneligible(t *testing.T) {
 	}
 }
 
-// registerAndWaitForRunning registers a job and waits for the expected number
-// of allocations to be in a running state. Returns the allocations.
-func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
+// testDeadlineFlag tests the enforcement of the node drain deadline so that
+// allocations are moved even if max_parallel says we should be waiting
+func testDeadlineFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Short()
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)
+
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID1 := allocs[0].ID
+	oldNodeID1 := allocs[0].NodeID
+	oldAllocID2 := allocs[1].ID
+	oldNodeID2 := allocs[1].NodeID
+
+	t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
+	out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+	out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain",
+		"-deadline", "1s",
+		"-enable", "-yes", "-detach", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain",
+		"-deadline", "1s",
+		"-enable", "-yes", "-detach", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))
+
+	// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60
+	// for both to be marked complete. Instead, because of the -deadline flag
+	// we'll expect the allocs to be stoppped almost immediately (give it 10s to
+	// avoid flakiness), and then the new allocs should come up and get marked
+	// healthy after ~30s
+	t.Log("waiting for old allocs to stop")
+	waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)
+
+	t.Log("waiting for running allocs")
+	waitForRunningAllocs(t, nomadClient, jobID, 2)
+}
+
+// testForceFlag tests the enforcement of the node drain -force flag so that
+// allocations are terminated immediately.
+func testForceFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Short()
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
+	allocs := waitForRunningAllocs(t, nomadClient, jobID, 2)
+
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID1 := allocs[0].ID
+	oldNodeID1 := allocs[0].NodeID
+	oldAllocID2 := allocs[1].ID
+	oldNodeID2 := allocs[1].NodeID
+
+	t.Logf("draining nodes %s, %s", oldNodeID1, oldNodeID2)
+	out, err := e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+	out, err = e2eutil.Command("nomad", "node", "eligibility", "-disable", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("nomad node eligibility -disable failed: %v\n%v", err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain", "-force",
+		"-enable", "-yes", "-detach", oldNodeID1)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID1, err, out))
+
+	out, err = e2eutil.Command(
+		"nomad", "node", "drain", "-force",
+		"-enable", "-yes", "-detach", oldNodeID2)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID2, err, out))
+
+	// with max_parallel=1 and min_healthy_time=30s we'd expect it to take ~60
+	// for both to be marked complete. Instead, because of the -force flag
+	// we'll expect the allocs to be stoppped almost immediately (give it 10s to
+	// avoid flakiness), and then the new allocs should come up and get marked
+	// healthy after ~30s
+	t.Log("waiting for old allocs to stop")
+	waitForAllocsStop(t, nomadClient, time.Second*10, oldAllocID1, oldAllocID2)
+
+	t.Log("waiting for running allocs")
+	waitForRunningAllocs(t, nomadClient, jobID, 2)
+}
+
+func waitForRunningAllocs(t *testing.T, nomadClient *api.Client, jobID string, expectedRunningCount int) []*api.AllocationListStub {
 	t.Helper()
 
-	var allocs []*api.AllocationListStub
-	var err error
-	must.NoError(t, e2eutil.Register(jobID, jobSpec))
+	runningAllocs := set.From([]*api.AllocationListStub{})
 
 	must.Wait(t, wait.InitialSuccess(
 		wait.ErrorFunc(func() error {
-			allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
-			if err != nil {
-				return fmt.Errorf("expected no error listing allocs: %v", err)
-			}
-			if len(allocs) != expectedCount {
-				return fmt.Errorf("expected %d allocs but found %d", expectedCount, len(allocs))
-			}
+			allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
+			must.NoError(t, err)
+			count := 0
 			for _, alloc := range allocs {
-				if alloc.ClientStatus != structs.AllocClientStatusRunning {
-					return fmt.Errorf("alloc %q was %q, not running", alloc.ID, alloc.ClientStatus)
+				if alloc.ClientStatus == structs.AllocClientStatusRunning {
+					runningAllocs.Insert(alloc)
 				}
 			}
+			if runningAllocs.Size() < expectedRunningCount {
+				return fmt.Errorf("expected %d running allocs, got %d", expectedRunningCount, count)
+			}
 			return nil
 		}),
 		wait.Timeout(60*time.Second),
 		wait.Gap(500*time.Millisecond),
 	))
-	return allocs
+	return runningAllocs.Slice()
 }
 
-// waitForAllocDrain polls the allocation statues for a job until we've finished
+// waitForAllocDrainComplete polls the allocation statues for a job until we've finished
 // migrating:
 // - the old alloc should be stopped
 // - the new alloc should be running
-func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
+func waitForAllocDrainComplete(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string, deadline time.Duration) []*api.AllocationListStub {
 
 	t.Helper()
 	newAllocs := set.From([]*api.AllocationListStub{})
@@ -240,16 +335,45 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
 				oldNodeID[:8], time.Now().Sub(start))
 			return nil
 		}),
-		wait.Timeout(120*time.Second),
+		wait.Timeout(deadline),
 		wait.Gap(500*time.Millisecond),
 	))
 
 	return newAllocs.Slice()
 }
 
+// waitForAllocsStop polls the allocation statues for specific allocations until
+// they've stopped
+func waitForAllocsStop(t *testing.T, nomadClient *api.Client, deadline time.Duration, oldAllocIDs ...string) {
+	t.Helper()
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			for _, allocID := range oldAllocIDs {
+				alloc, _, err := nomadClient.Allocations().Info(allocID, nil)
+				must.NoError(t, err)
+				if alloc.ClientStatus != structs.AllocClientStatusComplete {
+					return fmt.Errorf("expected alloc %s to be complete, got %q",
+						allocID[:8], alloc.ClientStatus)
+				}
+			}
+			return nil
+		}),
+		wait.Timeout(deadline),
+		wait.Gap(500*time.Millisecond),
+	))
+}
+
 func cleanupJobState(t *testing.T, jobID string) func() {
 	return func() {
-		_, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
+		if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+			return
+		}
+
+		// we can't use the CLI here because some tests will stop the job during
+		// a running deployment, which returns a non-zero exit code
+		nomadClient := e2eutil.NomadClient(t)
+		_, _, err := nomadClient.Jobs().Deregister(jobID, true, nil)
 		test.NoError(t, err)
 	}
 }
diff --git a/e2e/nodedrain/nodedrain.go b/e2e/nodedrain/nodedrain.go
deleted file mode 100644
index 75d9697e1da..00000000000
--- a/e2e/nodedrain/nodedrain.go
+++ /dev/null
@@ -1,180 +0,0 @@
-package nodedrain
-
-import (
-	"fmt"
-	"os"
-	"time"
-
-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
-	"github.com/hashicorp/nomad/e2e/framework"
-	"github.com/hashicorp/nomad/helper/uuid"
-	"github.com/hashicorp/nomad/testutil"
-)
-
-const ns = ""
-
-type NodeDrainE2ETest struct {
-	framework.TC
-	jobIDs  []string
-	nodeIDs []string
-}
-
-func init() {
-	framework.AddSuites(&framework.TestSuite{
-		Component:   "NodeDrain",
-		CanRunLocal: true,
-		Consul:      true,
-		Cases: []framework.TestCase{
-			new(NodeDrainE2ETest),
-		},
-	})
-
-}
-
-func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
-}
-
-func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
-	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
-		return
-	}
-
-	for _, id := range tc.jobIDs {
-		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
-		f.Assert().NoError(err)
-	}
-	tc.jobIDs = []string{}
-
-	for _, id := range tc.nodeIDs {
-		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
-		f.Assert().NoError(err)
-		_, err = e2e.Command("nomad", "node", "eligibility", "-enable", id)
-		f.Assert().NoError(err)
-	}
-	tc.nodeIDs = []string{}
-
-	_, err := e2e.Command("nomad", "system", "gc")
-	f.Assert().NoError(err)
-}
-
-func nodesForJob(jobID string) ([]string, error) {
-	allocs, err := e2e.AllocsForJob(jobID, ns)
-	if err != nil {
-		return nil, err
-	}
-	if len(allocs) < 1 {
-		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
-	}
-	nodes := []string{}
-	for _, alloc := range allocs {
-		nodes = append(nodes, alloc["Node ID"])
-	}
-	return nodes, nil
-}
-
-// waitForNodeDrain is a convenience wrapper that polls 'node status'
-// until the comparison function over the state of the job's allocs on that
-// node returns true
-func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
-	var got []map[string]string
-	var err error
-	interval, retries := wc.OrDefault()
-	testutil.WaitForResultRetries(retries, func() (bool, error) {
-		time.Sleep(interval)
-		got, err = e2e.AllocsForNode(nodeID)
-		if err != nil {
-			return false, err
-		}
-		return comparison(got), nil
-	}, func(e error) {
-		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
-	})
-	return err
-}
-
-// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
-// that allocations are terminated even if they haven't gracefully exited.
-func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent.  Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	f.T().Logf("draining node %v", nodeID)
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-deadline", "5s",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// the deadline is 40s but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is well under the 2m kill_timeout in
-	// the job.
-	// deadline here needs to account for scheduling and propagation delays.
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately following deadline")
-}
-
-// TestNodeDrainForce tests the enforcement of the node drain -force flag so
-// that allocations are terminated immediately.
-func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent.  Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-force",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// we've passed -force but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is under the 2m kill_timeout in
-	// the job
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately when forced")
-
-}
diff --git a/website/content/docs/commands/node/drain.mdx b/website/content/docs/commands/node/drain.mdx
index 3422e3ddcb2..eff783e134e 100644
--- a/website/content/docs/commands/node/drain.mdx
+++ b/website/content/docs/commands/node/drain.mdx
@@ -57,17 +57,19 @@ capability.
 - `-disable`: Disable node drain mode.
 
 - `-deadline`: Set the deadline by which all allocations must be moved off the
-  node. Remaining allocations after the deadline are force removed from the
-  node. Defaults to 1 hour.
+  node. Remaining allocations after the deadline are removed from the node,
+  regardless of their [`migrate`][] block. Defaults to 1 hour.
 
 - `-detach`: Return immediately instead of entering monitor mode.
 
 - `-monitor`: Enter monitor mode directly without modifying the drain status.
 
-- `-force`: Force remove allocations off the node immediately.
+- `-force`: Remove allocations off the node immediately, regardless of the
+  allocation's [`migrate`][] block.
 
-- `-no-deadline`: No deadline allows the allocations to drain off the node
-  without being force stopped after a certain deadline.
+- `-no-deadline`: No deadline allows the allocations to drain off the node,
+  ignoring the default 1 hour deadline before allocations are removed regardless
+  of their [`migrate`][] block.
 
 - `-ignore-system`: Ignore system allows the drain to complete without
   stopping system job allocations. By default system jobs (and CSI