ephemeral disk: migrate should imply sticky #16826

Merged · 2 commits · Apr 7, 2023
3 changes: 3 additions & 0 deletions .changelog/16826.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+ephemeral disk: migrate=true now implies sticky=true
+```
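For reference, a minimal jobspec sketch of the stanza this note concerns (job and group names are illustrative, the size is arbitrary): with this change, setting migrate = true behaves as though sticky = true were also set, so the scheduler prefers the previous node and otherwise migrates the ephemeral disk data to the new allocation.

```hcl
job "example" {
  group "web" {
    # migrate = true now implies sticky = true: the replacement alloc is
    # preferentially placed on the same node, and if it lands elsewhere
    # (e.g. during a drain) the alloc/data contents are migrated to it.
    ephemeral_disk {
      migrate = true
      size    = 300 # MB
    }
  }
}
```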
2 changes: 1 addition & 1 deletion client/allocwatcher/alloc_watcher.go
@@ -98,8 +98,8 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	logger := c.Logger.Named("alloc_migrator").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID)

 	tasks := tg.Tasks
-	sticky := tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky
 	migrate := tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate
+	sticky := tg.EphemeralDisk != nil && (tg.EphemeralDisk.Sticky || migrate)

 	if m != nil {
 		// Local Allocation because there's an alloc runner
18 changes: 1 addition & 17 deletions e2e/nodedrain/input/drain_migrate.nomad
@@ -1,5 +1,4 @@
 job "drain_migrate" {
-  datacenters = ["dc1", "dc2"]

   constraint {
     attribute = "${attr.kernel.name}"
@@ -19,22 +18,7 @@ job "drain_migrate" {
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["local/test.sh"]
-      }
-
-      template {
-        data = <<EOT
-#!/bin/sh
-if [ ! -f /alloc/data/{{ env "NOMAD_JOB_NAME" }} ]; then
-echo writing {{ env "NOMAD_ALLOC_ID" }} to /alloc/data/{{ env "NOMAD_JOB_NAME" }}
-echo {{ env "NOMAD_ALLOC_ID" }} > /alloc/data/{{ env "NOMAD_JOB_NAME" }}
-else
-echo /alloc/data/{{ env "NOMAD_JOB_NAME" }} already exists
-fi
-sleep 3600
-EOT
-
-        destination = "local/test.sh"
+        args    = ["-c", "echo \"data from $NOMAD_ALLOC_ID\" >> /alloc/data/migrate.txt && sleep 120"]
       }

       resources {
67 changes: 66 additions & 1 deletion e2e/nodedrain/node_drain_test.go
@@ -3,6 +3,7 @@ package nodedrain
 import (
 	"fmt"
 	"os"
+	"strings"
 	"testing"
 	"time"

@@ -24,6 +25,7 @@ func TestNodeDrain(t *testing.T) {
 	e2eutil.WaitForNodesReady(t, nomadClient, 2) // needs at least 2 to test migration

 	t.Run("IgnoreSystem", testIgnoreSystem)
+	t.Run("EphemeralMigrate", testEphemeralMigrate)
 	t.Run("KeepIneligible", testKeepIneligible)
 }

@@ -78,6 +80,69 @@ func testIgnoreSystem(t *testing.T) {
 	}
 }

+// testEphemeralMigrate tests that ephemeral_disk migrations work as expected
+// even during a node drain.
+func testEphemeralMigrate(t *testing.T) {
+
+	t.Cleanup(cleanupDrainState(t))
+
+	nomadClient := e2eutil.NomadClient(t)
+	jobID := "drain-migrate-" + uuid.Short()
+
+	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_migrate.nomad", 1)
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID := allocs[0].ID
+	oldNodeID := allocs[0].NodeID
+
+	// make sure the allocation has written its ID to disk so we have something to migrate
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			got, err := e2eutil.Command("nomad", "alloc", "fs", oldAllocID,
+				"alloc/data/migrate.txt")
+			if err != nil {
+				return fmt.Errorf("did not expect error reading alloc fs: %v", err)
+			}
+			if !strings.Contains(got, oldAllocID) {
+				return fmt.Errorf("expected data to be written for alloc %q", oldAllocID)
+			}
+			return nil
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+
+	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
+	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
+
+	newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
+	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
+	newAllocID := newAllocs[0].ID
+	newNodeID := newAllocs[0].NodeID
+
+	// although migrate=true implies sticky=true, the drained node is ineligible
+	// for scheduling, so the alloc should have been migrated to a different node
+	must.NotEq(t, oldNodeID, newNodeID, must.Sprint("new alloc was placed on draining node"))
+
+	// once the new allocation is running, it should quickly have the right data
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			got, err := e2eutil.Command("nomad", "alloc", "fs", newAllocID,
+				"alloc/data/migrate.txt")
+			if err != nil {
+				return fmt.Errorf("did not expect error reading alloc fs: %v", err)
+			}
+			if !strings.Contains(got, oldAllocID) || !strings.Contains(got, newAllocID) {
+				return fmt.Errorf(
+					"expected data to be migrated from alloc=%s on node=%s to alloc=%s on node=%s but got:\n%q",
+					oldAllocID[:8], oldNodeID[:8], newAllocID[:8], newNodeID[:8], got)
+			}
+			return nil
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+}

 // testKeepIneligible tests that nodes can be kept ineligible for scheduling after
 // disabling drain.
 func testKeepIneligible(t *testing.T) {

@@ -172,7 +237,7 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
 			}
 		}
 		t.Logf("alloc has drained from node=%s after %v",
-			oldNodeID, time.Now().Sub(start))
+			oldNodeID[:8], time.Now().Sub(start))
 		return nil
 	}),
 	wait.Timeout(120*time.Second),
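The check this test automates can also be reproduced by hand with the same commands it shells out to. A sketch of the manual flow (allocation and node IDs are placeholders; each allocation appends one line, so after a successful migration the file contains both):

```console
$ nomad node drain -enable -yes -detach <old-node-id>
$ nomad alloc fs <new-alloc-id> alloc/data/migrate.txt
data from <old-alloc-id>
data from <new-alloc-id>
```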
68 changes: 0 additions & 68 deletions e2e/nodedrain/nodedrain.go
@@ -3,7 +3,6 @@ package nodedrain
 import (
 	"fmt"
 	"os"
-	"strings"
 	"time"

 	e2e "github.com/hashicorp/nomad/e2e/e2eutil"

@@ -95,73 +94,6 @@ func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool,
 	return err
 }

-// TestNodeDrainEphemeralMigrate tests that ephermeral_disk migrations work as
-// expected even during a node drain.
-func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	allocs, err := e2e.AllocsForJob(jobID, ns)
-	f.NoError(err, "could not get allocs for job")
-	f.Len(allocs, 1, "could not get allocs for job")
-	oldAllocID := allocs[0]["ID"]
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			for _, alloc := range got {
-				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
-	), "node did not drain")
-
-	// wait for the allocation to be migrated
-	expected = []string{"running", "complete"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	allocs, err = e2e.AllocsForJob(jobID, ns)
-	f.NoError(err, "could not get allocations for job")
-
-	// the task writes its alloc ID to a file if it hasn't been previously
-	// written, so find the contents of the migrated file and make sure they
-	// match the old allocation, not the running one
-	var got string
-	var fsErr error
-	testutil.WaitForResultRetries(10, func() (bool, error) {
-		time.Sleep(time.Millisecond * 100)
-		for _, alloc := range allocs {
-			if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
-				got, fsErr = e2e.Command("nomad", "alloc", "fs",
-					alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
-				if err != nil {
-					return false, err
-				}
-				return true, nil
-			}
-		}
-		return false, fmt.Errorf("missing expected allocation")
-	}, func(e error) {
-		fsErr = e
-	})
-	f.NoError(fsErr, "could not get allocation data")
-	f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
-}

 // TestNodeDrainDeadline tests the enforcement of the node drain deadline so
 // that allocations are terminated even if they haven't gracefully exited.
 func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
5 changes: 2 additions & 3 deletions nomad/structs/structs.go
@@ -10634,9 +10634,8 @@ func (a *Allocation) ShouldMigrate() bool {
 		return false
 	}

-	// We won't migrate any data is the user hasn't enabled migration or the
-	// disk is not marked as sticky
-	if !tg.EphemeralDisk.Migrate || !tg.EphemeralDisk.Sticky {
+	// We won't migrate any data if the user hasn't enabled migration
+	if !tg.EphemeralDisk.Migrate {
 		return false
 	}
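The net effect of this hunk, as a simplified sketch (assumption: the helper name is hypothetical, and the real method's other checks on the allocation's status are elided here): the migrate flag alone now gates data migration.

```go
// shouldMigrateSketch is a simplified, illustrative version of the
// ephemeral-disk check in Allocation.ShouldMigrate after this change.
func shouldMigrateSketch(disk *structs.EphemeralDisk) bool {
	if disk == nil {
		return false
	}
	// Before this PR the check was disk.Migrate && disk.Sticky, so a
	// jobspec with migrate=true but sticky=false silently skipped the
	// data copy. Now migrate=true is sufficient on its own.
	return disk.Migrate
}
```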
