Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stale allocation data leads to incorrect (and even negative) metrics #5637

Merged
merged 2 commits into from
May 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions client/allocrunner/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,13 @@ func (s *State) Copy() *State {
TaskStates: taskStates,
}
}

// ClientTerminalStatus returns if the client status is terminal and will no longer transition
func (a *State) ClientTerminalStatus() bool {
switch a.ClientStatus {
case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
return true
default:
return false
}
}
23 changes: 6 additions & 17 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
"github.com/armon/go-metrics"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
hclog "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
Expand Down Expand Up @@ -2709,11 +2709,11 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Comparab
}

// Sum the allocated resources
allocs := c.allAllocs()
var allocated structs.ComparableResources
allocatedDeviceMbits := make(map[string]int)
for _, alloc := range allocs {
if alloc.TerminalStatus() {
for _, ar := range c.getAllocRunners() {
alloc := ar.Alloc()
if alloc.ServerTerminalStatus() || ar.AllocState().ClientTerminalStatus() {
continue
}

Expand Down Expand Up @@ -2760,17 +2760,6 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Comparab
return &allocated
}

// allAllocs returns all the allocations managed by the client
func (c *Client) allAllocs() map[string]*structs.Allocation {
ars := c.getAllocRunners()
allocs := make(map[string]*structs.Allocation, len(ars))
for _, ar := range ars {
a := ar.Alloc()
allocs[a.ID] = a
}
return allocs
}

// GetTaskEventHandler returns an event handler for the given allocID and task name
func (c *Client) GetTaskEventHandler(allocID, taskName string) drivermanager.EventHandler {
c.allocLock.RLock()
Expand Down
116 changes: 113 additions & 3 deletions client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"testing"
"time"

memdb "github.com/hashicorp/go-memdb"
"github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/client/config"
consulApi "github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/fingerprint"
Expand All @@ -26,7 +26,7 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"

hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
cstate "github.com/hashicorp/nomad/client/state"
ctestutil "github.com/hashicorp/nomad/client/testutil"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -893,7 +893,7 @@ func TestClient_BlockedAllocations(t *testing.T) {
t.Fatalf("err: %v", err)
}

// Enusre that the chained allocation is being tracked as blocked
// Ensure that the chained allocation is being tracked as blocked
testutil.WaitForResult(func() (bool, error) {
ar := c1.getAllocRunners()[alloc2.ID]
if ar == nil {
Expand Down Expand Up @@ -1414,6 +1414,116 @@ func TestClient_computeAllocatedDeviceStats(t *testing.T) {
assert.EqualValues(t, expected, result)
}

func TestClient_getAllocatedResources(t *testing.T) {
t.Parallel()
require := require.New(t)
client, cleanup := TestClient(t, nil)
defer cleanup()

allocStops := mock.BatchAlloc()
allocStops.Job.TaskGroups[0].Count = 1
allocStops.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
allocStops.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
"run_for": "1ms",
"exit_code": "0",
}
allocStops.Job.TaskGroups[0].RestartPolicy.Attempts = 0
allocStops.AllocatedResources.Shared.DiskMB = 64
allocStops.AllocatedResources.Tasks["web"].Cpu = structs.AllocatedCpuResources{CpuShares: 64}
allocStops.AllocatedResources.Tasks["web"].Memory = structs.AllocatedMemoryResources{MemoryMB: 64}
require.Nil(client.addAlloc(allocStops, ""))

allocFails := mock.BatchAlloc()
allocFails.Job.TaskGroups[0].Count = 1
allocFails.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
allocFails.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
"run_for": "1ms",
"exit_code": "1",
}
allocFails.Job.TaskGroups[0].RestartPolicy.Attempts = 0
allocFails.AllocatedResources.Shared.DiskMB = 128
allocFails.AllocatedResources.Tasks["web"].Cpu = structs.AllocatedCpuResources{CpuShares: 128}
allocFails.AllocatedResources.Tasks["web"].Memory = structs.AllocatedMemoryResources{MemoryMB: 128}
require.Nil(client.addAlloc(allocFails, ""))

allocRuns := mock.Alloc()
allocRuns.Job.TaskGroups[0].Count = 1
allocRuns.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
allocRuns.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
"run_for": "3s",
}
allocRuns.AllocatedResources.Shared.DiskMB = 256
allocRuns.AllocatedResources.Tasks["web"].Cpu = structs.AllocatedCpuResources{CpuShares: 256}
allocRuns.AllocatedResources.Tasks["web"].Memory = structs.AllocatedMemoryResources{MemoryMB: 256}
require.Nil(client.addAlloc(allocRuns, ""))

allocPends := mock.Alloc()
allocPends.Job.TaskGroups[0].Count = 1
allocPends.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
allocPends.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
"run_for": "5s",
"start_block_for": "10s",
}
allocPends.AllocatedResources.Shared.DiskMB = 512
allocPends.AllocatedResources.Tasks["web"].Cpu = structs.AllocatedCpuResources{CpuShares: 512}
allocPends.AllocatedResources.Tasks["web"].Memory = structs.AllocatedMemoryResources{MemoryMB: 512}
require.Nil(client.addAlloc(allocPends, ""))

// wait for allocStops to stop running and for allocRuns to be pending/running
testutil.WaitForResult(func() (bool, error) {
as, err := client.GetAllocState(allocPends.ID)
if err != nil {
return false, err
} else if as.ClientStatus != structs.AllocClientStatusPending {
return false, fmt.Errorf("allocPends not yet pending: %#v", as)
}

as, err = client.GetAllocState(allocRuns.ID)
if as.ClientStatus != structs.AllocClientStatusRunning {
return false, fmt.Errorf("allocRuns not yet running: %#v", as)
} else if err != nil {
return false, err
}

as, err = client.GetAllocState(allocStops.ID)
if err != nil {
return false, err
} else if as.ClientStatus != structs.AllocClientStatusComplete {
return false, fmt.Errorf("allocStops not yet complete: %#v", as)
}

as, err = client.GetAllocState(allocFails.ID)
if err != nil {
return false, err
} else if as.ClientStatus != structs.AllocClientStatusFailed {
return false, fmt.Errorf("allocFails not yet failed: %#v", as)
}

return true, nil
}, func(err error) {
require.NoError(err)
})

result := client.getAllocatedResources(client.config.Node)

expected := structs.ComparableResources{
Flattened: structs.AllocatedTaskResources{
Cpu: structs.AllocatedCpuResources{
CpuShares: 768,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 768,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured out right here why you were doubling the resource settings through this test, and I like it. Maybe worth a comment up near the top just to draw attention to it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i actually started to write a comment and it felt like i was ruining the joke ;)

},
Networks: nil,
},
Shared: structs.AllocatedSharedResources{
DiskMB: 768,
},
}

assert.EqualValues(t, expected, *result)
}

func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
t.Parallel()
client, cleanup := TestClient(t, nil)
Expand Down