From e5f16184dabfc8cb6111c129827d336e7da850a6 Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Wed, 6 Dec 2017 23:10:10 +0800 Subject: [PATCH] add disk io metrics --- docs/storage-schema.md | 6 +- integration/heapster_api_test.go | 6 +- metrics/core/metrics.go | 117 +++++++++++++++++++++++++- metrics/processors/rate_calculator.go | 78 ++++++++++++----- 4 files changed, 183 insertions(+), 24 deletions(-) diff --git a/docs/storage-schema.md b/docs/storage-schema.md index 2f22e8ed8a..8f9d79e475 100644 --- a/docs/storage-schema.md +++ b/docs/storage-schema.md @@ -17,6 +17,10 @@ Heapster exports the following metrics to its backends. | filesystem/available | The number of available bytes remaining in a the filesystem | | filesystem/inodes | The number of available inodes in a the filesystem | | filesystem/inodes_free | The number of free inodes remaining in a the filesystem | +| disk/io_read_bytes | Number of bytes read from a disk partition | +| disk/io_write_bytes | Number of bytes written to a disk partition | +| disk/io_read_bytes_rate | Number of bytes read from a disk partition per second | +| disk/io_write_bytes_rate | Number of bytes written to a disk partition per second | | memory/limit | Memory hard limit in bytes. | | memory/major_page_faults | Number of major page faults. | | memory/major_page_faults_rate | Number of major page faults per second. | @@ -62,7 +66,7 @@ Heapster tags each metric with the following labels. | labels | Comma-separated(Default) list of user-provided labels. Format is 'key:value' | | namespace_id | UID of the namespace of a Pod | | namespace_name | User-provided name of a Namespace | -| resource_id | A unique identifier used to differentiate multiple metrics of the same type. e.x. Fs partitions under filesystem/usage | +| resource_id | A unique identifier used to differentiate multiple metrics of the same type. e.x. Fs partitions under filesystem/usage, disk device name under disk/io_read_bytes | | make | Make of the accelerator (nvidia, amd, google etc.) | | model | Model of the accelerator (tesla-p100, tesla-k80 etc.) | | accelerator_id | ID of the accelerator | diff --git a/integration/heapster_api_test.go b/integration/heapster_api_test.go index 554bd4ffe3..9371fec965 100644 --- a/integration/heapster_api_test.go +++ b/integration/heapster_api_test.go @@ -359,7 +359,11 @@ func runMetricExportTest(fm kubeFramework, svc *kube_v1.Service) error { core.MetricFilesystemInodesFree.Name: {core.LabelResourceID.Key}, core.MetricAcceleratorMemoryTotal.Name: {core.LabelAcceleratorMake.Key, core.LabelAcceleratorModel.Key, core.LabelAcceleratorID.Key}, core.MetricAcceleratorMemoryUsed.Name: {core.LabelAcceleratorMake.Key, core.LabelAcceleratorModel.Key, core.LabelAcceleratorID.Key}, - core.MetricAcceleratorDutyCycle.Name: {core.LabelAcceleratorMake.Key, core.LabelAcceleratorModel.Key, core.LabelAcceleratorID.Key}} + core.MetricAcceleratorDutyCycle.Name: {core.LabelAcceleratorMake.Key, core.LabelAcceleratorModel.Key, core.LabelAcceleratorID.Key}, + core.MetricDiskIORead.Name: {core.LabelResourceID.Key}, + core.MetricDiskIOReadRate.Name: {core.LabelResourceID.Key}, + core.MetricDiskIOWrite.Name: {core.LabelResourceID.Key}, + core.MetricDiskIOWriteRate.Name: {core.LabelResourceID.Key}} for metricName, points := range ts.Metrics { md, exists := mdMap[metricName] diff --git a/metrics/core/metrics.go b/metrics/core/metrics.go index 6c74ff4ae1..6870637105 100644 --- a/metrics/core/metrics.go +++ b/metrics/core/metrics.go @@ -15,6 +15,7 @@ package core import ( + "fmt" "time" cadvisor "github.com/google/cadvisor/info/v1" @@ -54,7 +55,9 @@ var RateMetrics = []Metric{ MetricNetworkRxRate, MetricNetworkRxErrorsRate, MetricNetworkTxRate, - MetricNetworkTxErrorsRate} + MetricNetworkTxErrorsRate, + MetricDiskIOReadRate, + MetricDiskIOWriteRate} var RateMetricsMapping = map[string]Metric{ MetricCpuUsage.MetricDescriptor.Name: MetricCpuUsageRate, @@ -63,9 +66,15 @@ var RateMetricsMapping = map[string]Metric{ MetricNetworkRx.MetricDescriptor.Name: MetricNetworkRxRate, MetricNetworkRxErrors.MetricDescriptor.Name: MetricNetworkRxErrorsRate, MetricNetworkTx.MetricDescriptor.Name: MetricNetworkTxRate, - MetricNetworkTxErrors.MetricDescriptor.Name: MetricNetworkTxErrorsRate} + MetricNetworkTxErrors.MetricDescriptor.Name: MetricNetworkTxErrorsRate, + MetricDiskIORead.MetricDescriptor.Name: MetricDiskIOReadRate, + MetricDiskIOWrite.MetricDescriptor.Name: MetricDiskIOWriteRate} var LabeledMetrics = []Metric{ + MetricDiskIORead, + MetricDiskIOReadRate, + MetricDiskIOWrite, + MetricDiskIOWriteRate, MetricFilesystemUsage, MetricFilesystemLimit, MetricFilesystemAvailable, @@ -885,6 +894,110 @@ var MetricAcceleratorDutyCycle = Metric{ }, } +var MetricDiskIORead = Metric{ + MetricDescriptor: MetricDescriptor{ + Name: "disk/io_read_bytes", + Description: "Cumulative number of bytes read over disk", + Type: MetricCumulative, + ValueType: ValueInt64, + Units: UnitsBytes, + Labels: metricLabels, + }, + HasLabeledMetric: func(spec *cadvisor.ContainerSpec) bool { + return spec.HasDiskIo + }, + GetLabeledMetric: func(spec *cadvisor.ContainerSpec, stat *cadvisor.ContainerStats) []LabeledMetric { + result := make([]LabeledMetric, 0, len(stat.DiskIo.IoServiceBytes)) + for _, ioServiceBytesPerPartition := range stat.DiskIo.IoServiceBytes { + resourceIDKey := ioServiceBytesPerPartition.Device + if resourceIDKey == "" { + resourceIDKey = fmt.Sprintf("%d:%d", ioServiceBytesPerPartition.Major, ioServiceBytesPerPartition.Minor) + } + + var value uint64 + if v, exists := ioServiceBytesPerPartition.Stats["Read"]; exists { + value = v + } + + result = append(result, LabeledMetric{ + Name: "disk/io_read_bytes", + Labels: map[string]string{ + LabelResourceID.Key: resourceIDKey, + }, + MetricValue: MetricValue{ + ValueType: ValueInt64, + MetricType: MetricGauge, + IntValue: int64(value), + }, + }) + } + return result + }, +} + +var MetricDiskIOWrite = Metric{ + MetricDescriptor: MetricDescriptor{ + Name: "disk/io_write_bytes", + Description: "Cumulative number of bytes write over disk", + Type: MetricCumulative, + ValueType: ValueInt64, + Units: UnitsBytes, + Labels: metricLabels, + }, + HasLabeledMetric: func(spec *cadvisor.ContainerSpec) bool { + return spec.HasDiskIo + }, + GetLabeledMetric: func(spec *cadvisor.ContainerSpec, stat *cadvisor.ContainerStats) []LabeledMetric { + result := make([]LabeledMetric, 0, len(stat.DiskIo.IoServiceBytes)) + for _, ioServiceBytesPerPartition := range stat.DiskIo.IoServiceBytes { + resourceIDKey := ioServiceBytesPerPartition.Device + if resourceIDKey == "" { + resourceIDKey = fmt.Sprintf("%d:%d", ioServiceBytesPerPartition.Major, ioServiceBytesPerPartition.Minor) + } + + var value uint64 + if v, exists := ioServiceBytesPerPartition.Stats["Write"]; exists { + value = v + } + + result = append(result, LabeledMetric{ + Name: "disk/io_write_bytes", + Labels: map[string]string{ + LabelResourceID.Key: resourceIDKey, + }, + MetricValue: MetricValue{ + ValueType: ValueInt64, + MetricType: MetricGauge, + IntValue: int64(value), + }, + }) + } + return result + }, +} + +var MetricDiskIOReadRate = Metric{ + MetricDescriptor: MetricDescriptor{ + Name: "disk/io_read_bytes_rate", + Description: "Rate of bytes read over disk in bytes per second", + Type: MetricGauge, + ValueType: ValueFloat, + Units: UnitsCount, + Labels: metricLabels, + }, +} + +var MetricDiskIOWriteRate = Metric{ + MetricDescriptor: MetricDescriptor{ + Name: "disk/io_write_bytes_rate", + Description: "Rate of bytes written over disk in bytes per second", + Type: MetricGauge, + ValueType: ValueFloat, + Units: UnitsCount, + Labels: metricLabels, + }, +} + func IsNodeAutoscalingMetric(name string) bool { for _, autoscalingMetric := range NodeAutoscalingMetrics { if autoscalingMetric.MetricDescriptor.Name == name { diff --git a/metrics/processors/rate_calculator.go b/metrics/processors/rate_calculator.go index 33880c35d9..22fbede06d 100644 --- a/metrics/processors/rate_calculator.go +++ b/metrics/processors/rate_calculator.go @@ -57,31 +57,69 @@ func (this *RateCalculator) Process(batch *core.DataBatch) (*core.DataBatch, err continue } + var metricValNew, metricValOld core.MetricValue + var foundNew, foundOld bool + for metricName, targetMetric := range this.rateMetricsMapping { - metricValNew, foundNew := newMs.MetricValues[metricName] - metricValOld, foundOld := oldMs.MetricValues[metricName] - if foundNew && foundOld && metricName == core.MetricCpuUsage.MetricDescriptor.Name { - // cpu/usage values are in nanoseconds; we want to have it in millicores (that's why constant 1000 is here). - newVal := 1000 * (metricValNew.IntValue - metricValOld.IntValue) / - (newMs.ScrapeTime.UnixNano() - oldMs.ScrapeTime.UnixNano()) - - newMs.MetricValues[targetMetric.MetricDescriptor.Name] = core.MetricValue{ - ValueType: core.ValueInt64, - MetricType: core.MetricGauge, - IntValue: newVal, + if metricName == core.MetricDiskIORead.MetricDescriptor.Name || metricName == core.MetricDiskIOWrite.MetricDescriptor.Name { + for _, itemNew := range newMs.LabeledMetrics { + foundNew, foundOld = false, false + if itemNew.Name == metricName { + metricValNew, foundNew = itemNew.MetricValue, true + for _, itemOld := range oldMs.LabeledMetrics { + if itemOld.Name == metricName { + metricValOld, foundOld = itemOld.MetricValue, true + break + } + } + } + + if foundNew && foundOld { + if targetMetric.MetricDescriptor.ValueType == core.ValueFloat { + newVal := 1e9 * float32(metricValNew.IntValue-metricValOld.IntValue) / + float32(newMs.ScrapeTime.UnixNano()-oldMs.ScrapeTime.UnixNano()) + + newMs.LabeledMetrics = append(newMs.LabeledMetrics, core.LabeledMetric{ + Name: targetMetric.MetricDescriptor.Name, + Labels: itemNew.Labels, + MetricValue: core.MetricValue{ + ValueType: core.ValueFloat, + MetricType: core.MetricGauge, + FloatValue: newVal, + }, + }) + } + } else if foundNew && !foundOld || !foundNew && foundOld { + glog.V(4).Infof("Skipping rates for %s in %s: metric not found in one of old (%v) or new (%v)", metricName, key, foundOld, foundNew) + } } + } else { + metricValNew, foundNew = newMs.MetricValues[metricName] + metricValOld, foundOld = oldMs.MetricValues[metricName] + + if foundNew && foundOld && metricName == core.MetricCpuUsage.MetricDescriptor.Name { + // cpu/usage values are in nanoseconds; we want to have it in millicores (that's why constant 1000 is here). + newVal := 1000 * (metricValNew.IntValue - metricValOld.IntValue) / + (newMs.ScrapeTime.UnixNano() - oldMs.ScrapeTime.UnixNano()) + + newMs.MetricValues[targetMetric.MetricDescriptor.Name] = core.MetricValue{ + ValueType: core.ValueInt64, + MetricType: core.MetricGauge, + IntValue: newVal, + } - } else if foundNew && foundOld && targetMetric.MetricDescriptor.ValueType == core.ValueFloat { - newVal := 1e9 * float32(metricValNew.IntValue-metricValOld.IntValue) / - float32(newMs.ScrapeTime.UnixNano()-oldMs.ScrapeTime.UnixNano()) + } else if foundNew && foundOld && targetMetric.MetricDescriptor.ValueType == core.ValueFloat { + newVal := 1e9 * float32(metricValNew.IntValue-metricValOld.IntValue) / + float32(newMs.ScrapeTime.UnixNano()-oldMs.ScrapeTime.UnixNano()) - newMs.MetricValues[targetMetric.MetricDescriptor.Name] = core.MetricValue{ - ValueType: core.ValueFloat, - MetricType: core.MetricGauge, - FloatValue: newVal, + newMs.MetricValues[targetMetric.MetricDescriptor.Name] = core.MetricValue{ + ValueType: core.ValueFloat, + MetricType: core.MetricGauge, + FloatValue: newVal, + } + } else if foundNew && !foundOld || !foundNew && foundOld { + glog.V(4).Infof("Skipping rates for %s in %s: metric not found in one of old (%v) or new (%v)", metricName, key, foundOld, foundNew) } - } else if foundNew && !foundOld || !foundNew && foundOld { - glog.V(4).Infof("Skipping rates for %s in %s: metric not found in one of old (%v) or new (%v)", metricName, key, foundOld, foundNew) } } }