Skip to content

Commit

Permalink
Expose PSI metrics with prometheus
Browse files Browse the repository at this point in the history
This adds support for reading PSI metrics via prometheus. We expose the
following for `psi_total`:

```
container_cpu_psi_total_seconds
container_memory_psi_total_seconds
container_io_psi_total_seconds
```

And for `psi_avg`:

```
container_cpu_psi_avg10_ratio
container_cpu_psi_avg60_ratio
container_cpu_psi_avg300_ratio

container_memory_psi_avg10_ratio
container_memory_psi_avg60_ratio
container_memory_psi_avg300_ratio

container_io_psi_avg10_ratio
container_io_psi_avg60_ratio
container_io_psi_avg300_ratio
```

Signed-off-by: Daniel Dao <[email protected]>
  • Loading branch information
dqminh committed May 17, 2022
1 parent b09dcc9 commit 0a22793
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 0 deletions.
77 changes: 77 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -1768,6 +1768,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
})
}

if includedMetrics.Has(container.PSITotalMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_cpu_psi_total_seconds",
help: "Total time spent under cpu pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.Cpu.PSI, "total")
},
}, {
name: "container_memory_psi_total_seconds",
help: "Total container time spent under memory pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.Memory.PSI, "total")
},
}, {
name: "container_io_psi_total_seconds",
help: "Total time spent under io pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.DiskIo.PSI, "total")
},
},
}...)
}

if includedMetrics.Has(container.PSIAvgMetrics) {
makePSIAvgMetric := func(controller, window string) containerMetric {
return containerMetric{
name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
valueType: prometheus.GaugeValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
switch controller {
case "cpu":
return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
case "memory":
return getPSIValues(s, &s.Memory.PSI, "avg"+window)
case "io":
return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
default:
return nil
}
},
}
}
for _, controller := range []string{"cpu", "memory", "io"} {
for _, window := range []string{"10", "60", "300"} {
c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
}
}
}

return c
}

Expand Down Expand Up @@ -2060,3 +2118,22 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
}
return values
}

// getPSIValues builds the samples for one PSI (pressure stall information)
// metric from psi. psiMetric selects which field is reported and must be one
// of "avg10", "avg60", "avg300" or "total".
//
// For a recognized psiMetric the result holds two samples, in this order: the
// value read from psi.Some with the single label value "some", then the value
// read from psi.Full with the single label value "full". Both samples carry
// s.Timestamp. For any other psiMetric the result is an empty, non-nil slice.
func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
	v := make(metricValues, 0, 2)
	switch psiMetric {
	case "avg10":
		v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}})
		v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}})
	case "avg60":
		v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}})
		v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}})
	case "avg300":
		v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}})
		v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}})
	case "total":
		// Convert to float64 before dividing: dividing the raw counter first
		// is performed in the counter's own type, which for an integer counter
		// discards everything below one whole second.
		// NOTE(review): the 1e9 divisor assumes Total is in nanoseconds; the
		// kernel's pressure files report "total" in microseconds, so confirm
		// the unit against whatever populates PSIStats.
		v = append(v, metricValue{value: float64(psi.Some.Total) / 1e9, timestamp: s.Timestamp, labels: []string{"some"}})
		v = append(v, metricValue{value: float64(psi.Full.Total) / 1e9, timestamp: s.Timestamp, labels: []string{"full"}})
	}
	return v
}
42 changes: 42 additions & 0 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
RunPeriods: 984285,
},
LoadAverage: 2,
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.1,
Avg60: 0.2,
Avg300: 0.3,
Total: 100,
},
Full: info.PSIData{
Avg10: 0.4,
Avg60: 0.5,
Avg300: 0.6,
Total: 200,
},
},
},
Memory: info.MemoryStats{
Usage: 8,
Expand Down Expand Up @@ -346,6 +360,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
RSS: 15,
MappedFile: 16,
Swap: 8192,
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.01,
Avg60: 0.02,
Avg300: 0.03,
Total: 1000,
},
Full: info.PSIData{
Avg10: 0.04,
Avg60: 0.05,
Avg300: 0.06,
Total: 2000,
},
},
},
Hugetlb: map[string]info.HugetlbStats{
"2Mi": {
Expand Down Expand Up @@ -538,6 +566,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
"Write": 6,
},
}},
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.11,
Avg60: 0.12,
Avg300: 0.13,
Total: 1111,
},
Full: info.PSIData{
Avg10: 0.14,
Avg60: 0.15,
Avg300: 0.16,
Total: 2222,
},
},
},
Filesystem: []info.FsStats{
{
Expand Down
48 changes: 48 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo
# TYPE container_memory_bandwidth_local_bytes gauge
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
# TYPE container_cpu_psi_avg10_ratio gauge
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
# TYPE container_cpu_psi_avg300_ratio gauge
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
# TYPE container_cpu_psi_avg60_ratio gauge
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
# TYPE container_cpu_psi_total_seconds counter
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
# TYPE container_io_psi_avg10_ratio gauge
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
# TYPE container_io_psi_avg300_ratio gauge
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
# TYPE container_io_psi_avg60_ratio gauge
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
# TYPE container_io_psi_total_seconds counter
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
# TYPE container_memory_psi_avg10_ratio gauge
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
# TYPE container_memory_psi_avg300_ratio gauge
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
# TYPE container_memory_psi_avg60_ratio gauge
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
# TYPE container_memory_psi_total_seconds counter
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000
48 changes: 48 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer"
# TYPE container_memory_bandwidth_local_bytes gauge
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
# TYPE container_cpu_psi_avg10_ratio gauge
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
# TYPE container_cpu_psi_avg300_ratio gauge
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
# TYPE container_cpu_psi_avg60_ratio gauge
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
# TYPE container_cpu_psi_total_seconds counter
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
# TYPE container_io_psi_avg10_ratio gauge
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
# TYPE container_io_psi_avg300_ratio gauge
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
# TYPE container_io_psi_avg60_ratio gauge
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
# TYPE container_io_psi_total_seconds counter
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
# TYPE container_memory_psi_avg10_ratio gauge
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
# TYPE container_memory_psi_avg300_ratio gauge
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
# TYPE container_memory_psi_avg60_ratio gauge
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
# TYPE container_memory_psi_total_seconds counter
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0 1395066363000
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0 1395066363000

0 comments on commit 0a22793

Please sign in to comment.