Skip to content

Commit

Permalink
Merge pull request #104806 from itsbilal/backport23.1-104640
Browse files Browse the repository at this point in the history
release-23.1: server: don't double-count RAID volumes in disk metrics
  • Loading branch information
itsbilal authored Jun 14, 2023
2 parents 89d3f29 + 2627b2e commit 88f9534
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 7 deletions.
3 changes: 3 additions & 0 deletions pkg/server/status/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ go_library(
"health_check.go",
"recorder.go",
"runtime.go",
"runtime_generic.go",
"runtime_jemalloc.go",
"runtime_jemalloc_darwin.go",
"runtime_linux.go",
"runtime_log.go",
],
# keep
Expand Down Expand Up @@ -126,6 +128,7 @@ go_test(
"jemalloc_test.go",
"main_test.go",
"recorder_test.go",
"runtime_linux_test.go",
"runtime_stats_test.go",
"runtime_test.go",
],
Expand Down
34 changes: 29 additions & 5 deletions pkg/server/status/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ package status
import (
"context"
"os"
"regexp"
"runtime"
"runtime/debug"
"time"

"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/util/cgroups"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/goschedstats"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
Expand Down Expand Up @@ -245,6 +247,13 @@ var (
}
)

// diskMetricsIgnoredDevices is a regex that matches any block devices that must be
// ignored for disk metrics (eg. sys.host.disk.write.bytes), as those devices
// have likely been counted elsewhere. This prevents us from double-counting,
// for instance, RAID volumes under both the logical volume and under the
// physical volume(s).
var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRICS_IGNORED_DEVICES", getDefaultIgnoredDevices())

// getCgoMemStats is a function that fetches stats for the C++ portion of the code.
// We will not necessarily have implementations for all builds, so check for nil first.
// Returns the following:
Expand Down Expand Up @@ -682,7 +691,7 @@ func getSummedDiskCounters(ctx context.Context) (DiskStats, error) {
return DiskStats{}, err
}

return sumDiskCounters(diskCounters), nil
return sumAndFilterDiskCounters(diskCounters)
}

func getSummedNetStats(ctx context.Context) (net.IOCountersStat, error) {
Expand All @@ -694,11 +703,26 @@ func getSummedNetStats(ctx context.Context) (net.IOCountersStat, error) {
return sumNetworkCounters(netCounters), nil
}

// sumDiskCounters returns a new disk.IOCountersStat whose values are the sum of the
// values in the slice of disk.IOCountersStats passed in.
func sumDiskCounters(disksStats []DiskStats) DiskStats {
// sumAndFilterDiskCounters returns a new disk.IOCountersStat whose values are
// the sum of the values in the slice of disk.IOCountersStats passed in. It
// filters out any disk counters that are likely reflecting values already
// counted elsewhere, eg. md* logical volumes that are created out of RAIDing
// underlying drives. The filtering regex defaults to a platform-dependent one.
func sumAndFilterDiskCounters(disksStats []DiskStats) (DiskStats, error) {
output := DiskStats{}
var ignored *regexp.Regexp
if diskMetricsIgnoredDevices != "" {
var err error
ignored, err = regexp.Compile(diskMetricsIgnoredDevices)
if err != nil {
return output, err
}
}

for _, stats := range disksStats {
if ignored != nil && ignored.MatchString(stats.Name) {
continue
}
output.ReadBytes += stats.ReadBytes
output.readCount += stats.readCount
output.readTime += stats.readTime
Expand All @@ -712,7 +736,7 @@ func sumDiskCounters(disksStats []DiskStats) DiskStats {

output.iopsInProgress += stats.iopsInProgress
}
return output
return output, nil
}

// subtractDiskCounters subtracts the counters in `sub` from the counters in `from`,
Expand Down
18 changes: 18 additions & 0 deletions pkg/server/status/runtime_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

//go:build !linux
// +build !linux

package status

func getDefaultIgnoredDevices() string {
return ""
}
24 changes: 24 additions & 0 deletions pkg/server/status/runtime_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

//go:build linux
// +build linux

package status

func getDefaultIgnoredDevices() string {
// Excludes disks that have likely been counted elsewhere already, eg.
// sda1 gets excluded because sda would count it instead, and nvme1n1p1 is
// excluded as nvme1n1 is counted.
//
// This default regex is taken from Prometheus:
// https://github.com/prometheus/node_exporter/blob/690efa61e86acefdf05bb4334a3d68128ded49c9/collector/diskstats_linux.go#L39
return "^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$"
}
66 changes: 66 additions & 0 deletions pkg/server/status/runtime_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

//go:build linux
// +build linux

package status

import (
"reflect"
"testing"

"github.com/cockroachdb/cockroach/pkg/util/leaktest"
)

func TestSumAndFilterDiskCountersLinux(t *testing.T) {
defer leaktest.AfterTest(t)()

counters := []DiskStats{
{
Name: "nvme0",
ReadBytes: 1,
readCount: 1,
iopsInProgress: 1,
WriteBytes: 1,
writeCount: 1,
},
{ // This must be excluded from the sum.
Name: "sda1",
ReadBytes: 100,
readCount: 100,
iopsInProgress: 100,
WriteBytes: 100,
writeCount: 100,
},
{
Name: "nvme1n1",
ReadBytes: 1,
readCount: 1,
iopsInProgress: 1,
WriteBytes: 1,
writeCount: 1,
},
}
summed, err := sumAndFilterDiskCounters(counters)
if err != nil {
t.Fatalf("error: %s", err.Error())
}
expected := DiskStats{
ReadBytes: 2,
readCount: 2,
WriteBytes: 2,
writeCount: 2,
iopsInProgress: 2,
}
if !reflect.DeepEqual(summed, expected) {
t.Fatalf("expected %+v; got %+v", expected, summed)
}
}
7 changes: 5 additions & 2 deletions pkg/server/status/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (
"github.com/shirou/gopsutil/v3/net"
)

func TestSumDiskCounters(t *testing.T) {
func TestSumAndFilterDiskCounters(t *testing.T) {
defer leaktest.AfterTest(t)()

counters := []DiskStats{
Expand All @@ -37,7 +37,10 @@ func TestSumDiskCounters(t *testing.T) {
writeCount: 1,
},
}
summed := sumDiskCounters(counters)
summed, err := sumAndFilterDiskCounters(counters)
if err != nil {
t.Fatalf("error: %s", err.Error())
}
expected := DiskStats{
ReadBytes: 2,
readCount: 2,
Expand Down

0 comments on commit 88f9534

Please sign in to comment.