Skip to content

Commit

Permalink
Merge pull request #121 from auhlig/node_problem_detector
Browse files Browse the repository at this point in the history
add NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
  • Loading branch information
brancz authored Apr 26, 2017
2 parents 652ea1f + ac266ac commit 28fcd65
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 1 deletion.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ additional metrics!
| kube_node_status_allocatable_cpu_cores | Gauge | `node`=<node-address>|
| kube_node_status_allocatable_memory_bytes | Gauge | `node`=<node-address>|
| kube_node_status_allocatable_pods | Gauge | `node`=<node-address>|
| kube_node_status_memory_pressure | Gauge | `node`=&lt;node-address&gt; <br> `condition`=&lt;true\|false\|unknown&gt; |
| kube_node_status_disk_pressure | Gauge | `node`=&lt;node-address&gt; <br> `condition`=&lt;true\|false\|unknown&gt; |
| kube_node_status_network_unavailable | Gauge | `node`=&lt;node-address&gt; <br> `condition`=&lt;true\|false\|unknown&gt; |

### DaemonSet Metrics

Expand Down
25 changes: 24 additions & 1 deletion node.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ var (
"The phase the node is currently in.",
[]string{"node", "phase"}, nil,
)
// descNodeStatusMemoryPressure describes the kube_node_status_memory_pressure
// gauge. Each series is labelled with the node name and a condition value
// (true, false or unknown); collectNode emits it for the
// v1.NodeMemoryPressure node condition.
descNodeStatusMemoryPressure = prometheus.NewDesc(
"kube_node_status_memory_pressure",
"Whether the kubelet is under pressure due to insufficient available memory.",
[]string{"node", "condition"}, nil,
)
// descNodeStatusDiskPressure describes the kube_node_status_disk_pressure
// gauge. Each series is labelled with the node name and a condition value
// (true, false or unknown); collectNode emits it for the
// v1.NodeDiskPressure node condition.
descNodeStatusDiskPressure = prometheus.NewDesc(
"kube_node_status_disk_pressure",
"Whether the kubelet is under pressure due to insufficient available disk.",
[]string{"node", "condition"}, nil,
)
// descNodeStatusNetworkUnavailable describes the
// kube_node_status_network_unavailable gauge. Each series is labelled with the
// node name and a condition value (true, false or unknown); collectNode emits
// it for the v1.NodeNetworkUnavailable node condition.
//
// NOTE(review): the help text below says "correctly configured", which reads
// as the opposite polarity of a metric named "network_unavailable" — confirm
// the intended wording. The same string is repeated in the expected test
// metadata, so both must change together.
descNodeStatusNetworkUnavailable = prometheus.NewDesc(
"kube_node_status_network_unavailable",
"Whether the network is correctly configured for the node.",
[]string{"node", "condition"}, nil,
)

descNodeStatusCapacityPods = prometheus.NewDesc(
"kube_node_status_capacity_pods",
Expand Down Expand Up @@ -131,6 +146,9 @@ func (nc *nodeCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- descNodeInfo
ch <- descNodeSpecUnschedulable
ch <- descNodeStatusReady
ch <- descNodeStatusMemoryPressure
ch <- descNodeStatusDiskPressure
ch <- descNodeStatusNetworkUnavailable
ch <- descNodeStatusOutOfDisk
ch <- descNodeStatusPhase
ch <- descNodeStatusCapacityCPU
Expand Down Expand Up @@ -171,13 +189,18 @@ func (nc *nodeCollector) collectNode(ch chan<- prometheus.Metric, n v1.Node) {
addGauge(descNodeSpecUnschedulable, boolFloat64(n.Spec.Unschedulable))

// Collect node conditions, defaulting to false.
// TODO(fabxc): add remaining conditions: NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
for _, c := range n.Status.Conditions {
switch c.Type {
case v1.NodeReady:
addConditionMetrics(ch, descNodeStatusReady, c.Status, n.Name)
case v1.NodeOutOfDisk:
addConditionMetrics(ch, descNodeStatusOutOfDisk, c.Status, n.Name)
case v1.NodeMemoryPressure:
addConditionMetrics(ch, descNodeStatusMemoryPressure, c.Status, n.Name)
case v1.NodeDiskPressure:
addConditionMetrics(ch, descNodeStatusDiskPressure, c.Status, n.Name)
case v1.NodeNetworkUnavailable:
addConditionMetrics(ch, descNodeStatusNetworkUnavailable, c.Status, n.Name)
}
}

Expand Down
147 changes: 147 additions & 0 deletions node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ func TestNodeCollector(t *testing.T) {
# HELP kube_node_status_allocatable_cpu_cores The CPU resources of a node that are available for scheduling.
# TYPE kube_node_status_allocatable_memory_bytes gauge
# HELP kube_node_status_allocatable_memory_bytes The memory resources of a node that are available for scheduling.
# HELP kube_node_status_memory_pressure Whether the kubelet is under pressure due to insufficient available memory.
# TYPE kube_node_status_memory_pressure gauge
# HELP kube_node_status_disk_pressure Whether the kubelet is under pressure due to insufficient available disk.
# TYPE kube_node_status_disk_pressure gauge
# HELP kube_node_status_network_unavailable Whether the network is correctly configured for the node.
# TYPE kube_node_status_network_unavailable gauge
`
cases := []struct {
nodes []v1.Node
Expand Down Expand Up @@ -214,6 +220,147 @@ func TestNodeCollector(t *testing.T) {
`,
metrics: []string{"kube_node_status_phase"},
},
// Verify MemoryPressure
{
nodes: []v1.Node{
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.1",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeMemoryPressure, Status: v1.ConditionTrue},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.2",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeMemoryPressure, Status: v1.ConditionUnknown},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.3",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeMemoryPressure, Status: v1.ConditionFalse},
},
},
},
},
want: metadata + `
kube_node_status_memory_pressure{node="127.0.0.1",condition="true"} 1
kube_node_status_memory_pressure{node="127.0.0.1",condition="false"} 0
kube_node_status_memory_pressure{node="127.0.0.1",condition="unknown"} 0
kube_node_status_memory_pressure{node="127.0.0.2",condition="true"} 0
kube_node_status_memory_pressure{node="127.0.0.2",condition="false"} 0
kube_node_status_memory_pressure{node="127.0.0.2",condition="unknown"} 1
kube_node_status_memory_pressure{node="127.0.0.3",condition="true"} 0
kube_node_status_memory_pressure{node="127.0.0.3",condition="false"} 1
kube_node_status_memory_pressure{node="127.0.0.3",condition="unknown"} 0
`,
metrics: []string{"kube_node_status_memory_pressure"},
},
// Verify DiskPressure
{
nodes: []v1.Node{
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.1",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeDiskPressure, Status: v1.ConditionTrue},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.2",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeDiskPressure, Status: v1.ConditionUnknown},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.3",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeDiskPressure, Status: v1.ConditionFalse},
},
},
},
},
want: metadata + `
kube_node_status_disk_pressure{node="127.0.0.1",condition="true"} 1
kube_node_status_disk_pressure{node="127.0.0.1",condition="false"} 0
kube_node_status_disk_pressure{node="127.0.0.1",condition="unknown"} 0
kube_node_status_disk_pressure{node="127.0.0.2",condition="true"} 0
kube_node_status_disk_pressure{node="127.0.0.2",condition="false"} 0
kube_node_status_disk_pressure{node="127.0.0.2",condition="unknown"} 1
kube_node_status_disk_pressure{node="127.0.0.3",condition="true"} 0
kube_node_status_disk_pressure{node="127.0.0.3",condition="false"} 1
kube_node_status_disk_pressure{node="127.0.0.3",condition="unknown"} 0
`,
metrics: []string{"kube_node_status_disk_pressure"},
},
// Verify NetworkUnavailable
{
nodes: []v1.Node{
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.1",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeNetworkUnavailable, Status: v1.ConditionTrue},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.2",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeNetworkUnavailable, Status: v1.ConditionUnknown},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "127.0.0.3",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: v1.NodeNetworkUnavailable, Status: v1.ConditionFalse},
},
},
},
},
want: metadata + `
kube_node_status_network_unavailable{node="127.0.0.1",condition="true"} 1
kube_node_status_network_unavailable{node="127.0.0.1",condition="false"} 0
kube_node_status_network_unavailable{node="127.0.0.1",condition="unknown"} 0
kube_node_status_network_unavailable{node="127.0.0.2",condition="true"} 0
kube_node_status_network_unavailable{node="127.0.0.2",condition="false"} 0
kube_node_status_network_unavailable{node="127.0.0.2",condition="unknown"} 1
kube_node_status_network_unavailable{node="127.0.0.3",condition="true"} 0
kube_node_status_network_unavailable{node="127.0.0.3",condition="false"} 1
kube_node_status_network_unavailable{node="127.0.0.3",condition="unknown"} 0
`,
metrics: []string{"kube_node_status_network_unavailable"},
},
}
for _, c := range cases {
dc := &nodeCollector{
Expand Down

0 comments on commit 28fcd65

Please sign in to comment.