Skip to content

Commit

Permalink
add additional stats from mdstat (prometheus#380)
Browse files Browse the repository at this point in the history
* Add several Infiniband counters

Counters added:
* excessive_buffer_overrun_errors
* local_link_integrity_errors

Signed-off-by: Trey Dockendorf <[email protected]>
Signed-off-by: John Seekins <[email protected]>

* add additional stats from mdstat

Signed-off-by: John Seekins <[email protected]>

* return successful values every time

Signed-off-by: John Seekins <[email protected]>

* add count of 'downed' disks

Signed-off-by: John Seekins <[email protected]>

Co-authored-by: Trey Dockendorf <[email protected]>
  • Loading branch information
2 people authored and jritter committed Jul 15, 2024
1 parent 8358822 commit 6ef78c8
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 45 deletions.
105 changes: 77 additions & 28 deletions mdstat.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ import (
)

var (
statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`)
recoveryLineRE = regexp.MustCompile(`\((\d+)/\d+\)`)
componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`)
statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[([U_]+)\]`)
recoveryLineBlocksRE = regexp.MustCompile(`\((\d+)/\d+\)`)
recoveryLinePctRE = regexp.MustCompile(`= (.+)%`)
recoveryLineFinishRE = regexp.MustCompile(`finish=(.+)min`)
recoveryLineSpeedRE = regexp.MustCompile(`speed=(.+)[A-Z]`)
componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`)
)

// MDStat holds info parsed from /proc/mdstat.
Expand All @@ -39,12 +42,20 @@ type MDStat struct {
DisksTotal int64
// Number of failed disks.
DisksFailed int64
// Number of "down" disks. (the _ indicator in the status line)
DisksDown int64
// Spare disks in the device.
DisksSpare int64
// Number of blocks the device holds.
BlocksTotal int64
// Number of blocks on the device that are in sync.
BlocksSynced int64
// progress percentage of current sync
BlocksSyncedPct float64
// estimated finishing time for current sync (in minutes)
BlocksSyncedFinishTime float64
// current sync speed (in Kilobytes/sec)
BlocksSyncedSpeed float64
// Name of md component devices
Devices []string
}
Expand Down Expand Up @@ -91,7 +102,7 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
// Failed disks have the suffix (F) & Spare disks have the suffix (S).
fail := int64(strings.Count(line, "(F)"))
spare := int64(strings.Count(line, "(S)"))
active, total, size, err := evalStatusLine(lines[i], lines[i+1])
active, total, down, size, err := evalStatusLine(lines[i], lines[i+1])

if err != nil {
return nil, fmt.Errorf("error parsing md device lines: %w", err)
Expand All @@ -105,6 +116,9 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
// If device is syncing at the moment, get the number of currently
// synced bytes, otherwise that number equals the size of the device.
syncedBlocks := size
speed := float64(0)
finish := float64(0)
pct := float64(0)
recovering := strings.Contains(lines[syncLineIdx], "recovery")
resyncing := strings.Contains(lines[syncLineIdx], "resync")
checking := strings.Contains(lines[syncLineIdx], "check")
Expand All @@ -124,77 +138,112 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
strings.Contains(lines[syncLineIdx], "DELAYED") {
syncedBlocks = 0
} else {
syncedBlocks, err = evalRecoveryLine(lines[syncLineIdx])
syncedBlocks, pct, finish, speed, err = evalRecoveryLine(lines[syncLineIdx])
if err != nil {
return nil, fmt.Errorf("error parsing sync line in md device %q: %w", mdName, err)
}
}
}

mdStats = append(mdStats, MDStat{
Name: mdName,
ActivityState: state,
DisksActive: active,
DisksFailed: fail,
DisksSpare: spare,
DisksTotal: total,
BlocksTotal: size,
BlocksSynced: syncedBlocks,
Devices: evalComponentDevices(deviceFields),
Name: mdName,
ActivityState: state,
DisksActive: active,
DisksFailed: fail,
DisksDown: down,
DisksSpare: spare,
DisksTotal: total,
BlocksTotal: size,
BlocksSynced: syncedBlocks,
BlocksSyncedPct: pct,
BlocksSyncedFinishTime: finish,
BlocksSyncedSpeed: speed,
Devices: evalComponentDevices(deviceFields),
})
}

return mdStats, nil
}

func evalStatusLine(deviceLine, statusLine string) (active, total, size int64, err error) {
func evalStatusLine(deviceLine, statusLine string) (active, total, down, size int64, err error) {

sizeStr := strings.Fields(statusLine)[0]
size, err = strconv.ParseInt(sizeStr, 10, 64)
if err != nil {
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
}

if strings.Contains(deviceLine, "raid0") || strings.Contains(deviceLine, "linear") {
// In the device deviceLine, only disks have a number associated with them in [].
total = int64(strings.Count(deviceLine, "["))
return total, total, size, nil
return total, total, 0, size, nil
}

if strings.Contains(deviceLine, "inactive") {
return 0, 0, size, nil
return 0, 0, 0, size, nil
}

matches := statusLineRE.FindStringSubmatch(statusLine)
if len(matches) != 4 {
return 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine)
if len(matches) != 5 {
return 0, 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine)
}

total, err = strconv.ParseInt(matches[2], 10, 64)
if err != nil {
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
}

active, err = strconv.ParseInt(matches[3], 10, 64)
if err != nil {
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
}
down = int64(strings.Count(matches[4], "_"))

return active, total, size, nil
return active, total, down, size, nil
}

func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, err error) {
matches := recoveryLineRE.FindStringSubmatch(recoveryLine)
func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, pct float64, finish float64, speed float64, err error) {
matches := recoveryLineBlocksRE.FindStringSubmatch(recoveryLine)
if len(matches) != 2 {
return 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine)
return 0, 0, 0, 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine)
}

syncedBlocks, err = strconv.ParseInt(matches[1], 10, 64)
if err != nil {
return 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err)
return 0, 0, 0, 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err)
}

return syncedBlocks, nil
// Get percentage complete
matches = recoveryLinePctRE.FindStringSubmatch(recoveryLine)
if len(matches) != 2 {
return syncedBlocks, 0, 0, 0, fmt.Errorf("unexpected recoveryLine matching percentage: %s", recoveryLine)
}
pct, err = strconv.ParseFloat(strings.TrimSpace(matches[1]), 64)
if err != nil {
return syncedBlocks, 0, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
}

// Get time expected left to complete
matches = recoveryLineFinishRE.FindStringSubmatch(recoveryLine)
if len(matches) != 2 {
return syncedBlocks, pct, 0, 0, fmt.Errorf("unexpected recoveryLine matching est. finish time: %s", recoveryLine)
}
finish, err = strconv.ParseFloat(matches[1], 64)
if err != nil {
return syncedBlocks, pct, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
}

// Get recovery speed
matches = recoveryLineSpeedRE.FindStringSubmatch(recoveryLine)
if len(matches) != 2 {
return syncedBlocks, pct, finish, 0, fmt.Errorf("unexpected recoveryLine matching speed: %s", recoveryLine)
}
speed, err = strconv.ParseFloat(matches[1], 64)
if err != nil {
return syncedBlocks, pct, finish, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
}

return syncedBlocks, pct, finish, speed, nil
}

func evalComponentDevices(deviceFields []string) []string {
Expand Down
34 changes: 17 additions & 17 deletions mdstat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,23 @@ func TestFS_MDStat(t *testing.T) {
}

refs := map[string]MDStat{
"md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, Devices: []string{"sdi2", "sdj2"}},
"md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, Devices: []string{"sdi1", "sdj1"}},
"md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, Devices: []string{"sda3", "sdb3"}},
"md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb2", "sdc", "sda2"}},
"md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}},
"md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb1", "sda1", "sdc", "sde"}},
"md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}},
"md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}},
"md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, Devices: []string{"sda1", "sdb1"}},
"md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}},
"md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, Devices: []string{"sdc2", "sdd2"}},
"md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, Devices: []string{"sda1", "sdb1"}},
"md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, Devices: []string{"sdb", "sdc"}},
"md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, Devices: []string{"sdc", "sda"}},
"md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, Devices: []string{"xvdb"}},
"md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, Devices: []string{"sdb", "sdd", "sdc"}},
"md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, Devices: []string{"sda3", "sdb3"}},
"md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi2", "sdj2"}},
"md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi1", "sdj1"}},
"md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksDown: 0, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda3", "sdb3"}},
"md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksDown: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb2", "sdc", "sda2"}},
"md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}},
"md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb1", "sda1", "sdc", "sde"}},
"md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksDown: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}},
"md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksDown: 0, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}},
"md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}},
"md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksDown: 0, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}},
"md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksDown: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2"}},
"md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}},
"md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdc"}},
"md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksDown: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc", "sda"}},
"md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"xvdb"}},
"md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdd", "sdc"}},
"md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, BlocksSyncedPct: 5.7, BlocksSyncedFinishTime: 0.2, BlocksSyncedSpeed: 114176, Devices: []string{"sda3", "sdb3"}},
}

if want, have := len(refs), len(mdStats); want != have {
Expand Down

0 comments on commit 6ef78c8

Please sign in to comment.