From 99f3c3718eb80878c45873c659eb6603e9024329 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Mon, 5 Jul 2021 09:17:10 -0600 Subject: [PATCH] add additional stats from mdstat (#380) * Add several Infiniband counters Counters added: * excessive_buffer_overrun_errors * local_link_integrity_errors Signed-off-by: Trey Dockendorf Signed-off-by: John Seekins * add additional stats from mdstat Signed-off-by: John Seekins * return successful values every time Signed-off-by: John Seekins * add count of 'downed' disks Signed-off-by: John Seekins Co-authored-by: Trey Dockendorf --- mdstat.go | 105 ++++++++++++++++++++++++++++++++++++------------- mdstat_test.go | 34 ++++++++-------- 2 files changed, 94 insertions(+), 45 deletions(-) diff --git a/mdstat.go b/mdstat.go index 4c4493bfa..f0b9e5f75 100644 --- a/mdstat.go +++ b/mdstat.go @@ -22,9 +22,12 @@ import ( ) var ( - statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`) - recoveryLineRE = regexp.MustCompile(`\((\d+)/\d+\)`) - componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`) + statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[([U_]+)\]`) + recoveryLineBlocksRE = regexp.MustCompile(`\((\d+)/\d+\)`) + recoveryLinePctRE = regexp.MustCompile(`= (.+)%`) + recoveryLineFinishRE = regexp.MustCompile(`finish=(.+)min`) + recoveryLineSpeedRE = regexp.MustCompile(`speed=(.+)[A-Z]`) + componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`) ) // MDStat holds info parsed from /proc/mdstat. @@ -39,12 +42,20 @@ type MDStat struct { DisksTotal int64 // Number of failed disks. DisksFailed int64 + // Number of "down" disks. (the _ indicator in the status line) + DisksDown int64 // Spare disks in the device. DisksSpare int64 // Number of blocks the device holds. BlocksTotal int64 // Number of blocks on the device that are in sync. BlocksSynced int64 + // progress percentage of current sync + BlocksSyncedPct float64 + // estimated finishing time for current sync (in minutes) + BlocksSyncedFinishTime float64 + // current sync speed (in Kilobytes/sec) + BlocksSyncedSpeed float64 // Name of md component devices Devices []string } @@ -91,7 +102,7 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) { // Failed disks have the suffix (F) & Spare disks have the suffix (S). fail := int64(strings.Count(line, "(F)")) spare := int64(strings.Count(line, "(S)")) - active, total, size, err := evalStatusLine(lines[i], lines[i+1]) + active, total, down, size, err := evalStatusLine(lines[i], lines[i+1]) if err != nil { return nil, fmt.Errorf("error parsing md device lines: %w", err) @@ -105,6 +116,9 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) { // If device is syncing at the moment, get the number of currently // synced bytes, otherwise that number equals the size of the device. syncedBlocks := size + speed := float64(0) + finish := float64(0) + pct := float64(0) recovering := strings.Contains(lines[syncLineIdx], "recovery") resyncing := strings.Contains(lines[syncLineIdx], "resync") checking := strings.Contains(lines[syncLineIdx], "check") @@ -124,7 +138,7 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) { strings.Contains(lines[syncLineIdx], "DELAYED") { syncedBlocks = 0 } else { - syncedBlocks, err = evalRecoveryLine(lines[syncLineIdx]) + syncedBlocks, pct, finish, speed, err = evalRecoveryLine(lines[syncLineIdx]) if err != nil { return nil, fmt.Errorf("error parsing sync line in md device %q: %w", mdName, err) } @@ -132,69 +146,104 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) { } mdStats = append(mdStats, MDStat{ - Name: mdName, - ActivityState: state, - DisksActive: active, - DisksFailed: fail, - DisksSpare: spare, - DisksTotal: total, - BlocksTotal: size, - BlocksSynced: syncedBlocks, - Devices: evalComponentDevices(deviceFields), + Name: mdName, + ActivityState: state, + DisksActive: active, + DisksFailed: fail, + DisksDown: down, + DisksSpare: spare, + DisksTotal: total, + BlocksTotal: size, + BlocksSynced: syncedBlocks, + BlocksSyncedPct: pct, + BlocksSyncedFinishTime: finish, + BlocksSyncedSpeed: speed, + Devices: evalComponentDevices(deviceFields), }) } return mdStats, nil } -func evalStatusLine(deviceLine, statusLine string) (active, total, size int64, err error) { +func evalStatusLine(deviceLine, statusLine string) (active, total, down, size int64, err error) { sizeStr := strings.Fields(statusLine)[0] size, err = strconv.ParseInt(sizeStr, 10, 64) if err != nil { - return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) + return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) } if strings.Contains(deviceLine, "raid0") || strings.Contains(deviceLine, "linear") { // In the device deviceLine, only disks have a number associated with them in []. total = int64(strings.Count(deviceLine, "[")) - return total, total, size, nil + return total, total, 0, size, nil } if strings.Contains(deviceLine, "inactive") { - return 0, 0, size, nil + return 0, 0, 0, size, nil } matches := statusLineRE.FindStringSubmatch(statusLine) - if len(matches) != 4 { - return 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine) + if len(matches) != 5 { + return 0, 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine) } total, err = strconv.ParseInt(matches[2], 10, 64) if err != nil { - return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) + return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) } active, err = strconv.ParseInt(matches[3], 10, 64) if err != nil { - return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) + return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err) } + down = int64(strings.Count(matches[4], "_")) - return active, total, size, nil + return active, total, down, size, nil } -func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, err error) { - matches := recoveryLineRE.FindStringSubmatch(recoveryLine) +func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, pct float64, finish float64, speed float64, err error) { + matches := recoveryLineBlocksRE.FindStringSubmatch(recoveryLine) if len(matches) != 2 { - return 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine) + return 0, 0, 0, 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine) } syncedBlocks, err = strconv.ParseInt(matches[1], 10, 64) if err != nil { - return 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err) + return 0, 0, 0, 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err) } - return syncedBlocks, nil + // Get percentage complete + matches = recoveryLinePctRE.FindStringSubmatch(recoveryLine) + if len(matches) != 2 { + return syncedBlocks, 0, 0, 0, fmt.Errorf("unexpected recoveryLine matching percentage: %s", recoveryLine) + } + pct, err = strconv.ParseFloat(strings.TrimSpace(matches[1]), 64) + if err != nil { + return syncedBlocks, 0, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err) + } + + // Get time expected left to complete + matches = recoveryLineFinishRE.FindStringSubmatch(recoveryLine) + if len(matches) != 2 { + return syncedBlocks, pct, 0, 0, fmt.Errorf("unexpected recoveryLine matching est. finish time: %s", recoveryLine) + } + finish, err = strconv.ParseFloat(matches[1], 64) + if err != nil { + return syncedBlocks, pct, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err) + } + + // Get recovery speed + matches = recoveryLineSpeedRE.FindStringSubmatch(recoveryLine) + if len(matches) != 2 { + return syncedBlocks, pct, finish, 0, fmt.Errorf("unexpected recoveryLine matching speed: %s", recoveryLine) + } + speed, err = strconv.ParseFloat(matches[1], 64) + if err != nil { + return syncedBlocks, pct, finish, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err) + } + + return syncedBlocks, pct, finish, speed, nil } func evalComponentDevices(deviceFields []string) []string { diff --git a/mdstat_test.go b/mdstat_test.go index 1b9a8ea81..d572cb346 100644 --- a/mdstat_test.go +++ b/mdstat_test.go @@ -25,23 +25,23 @@ func TestFS_MDStat(t *testing.T) { } refs := map[string]MDStat{ - "md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, Devices: []string{"sdi2", "sdj2"}}, - "md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, Devices: []string{"sdi1", "sdj1"}}, - "md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, Devices: []string{"sda3", "sdb3"}}, - "md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb2", "sdc", "sda2"}}, - "md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}}, - "md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb1", "sda1", "sdc", "sde"}}, - "md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}}, - "md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}}, - "md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, Devices: []string{"sda1", "sdb1"}}, - "md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}}, - "md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, Devices: []string{"sdc2", "sdd2"}}, - "md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, Devices: []string{"sda1", "sdb1"}}, - "md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, Devices: []string{"sdb", "sdc"}}, - "md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, Devices: []string{"sdc", "sda"}}, - "md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, Devices: []string{"xvdb"}}, - "md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, Devices: []string{"sdb", "sdd", "sdc"}}, - "md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, Devices: []string{"sda3", "sdb3"}}, + "md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi2", "sdj2"}}, + "md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi1", "sdj1"}}, + "md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksDown: 0, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda3", "sdb3"}}, + "md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksDown: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb2", "sdc", "sda2"}}, + "md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}}, + "md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb1", "sda1", "sdc", "sde"}}, + "md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksDown: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}}, + "md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksDown: 0, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}}, + "md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}}, + "md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksDown: 0, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}}, + "md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksDown: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2"}}, + "md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}}, + "md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdc"}}, + "md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksDown: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc", "sda"}}, + "md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"xvdb"}}, + "md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdd", "sdc"}}, + "md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, BlocksSyncedPct: 5.7, BlocksSyncedFinishTime: 0.2, BlocksSyncedSpeed: 114176, Devices: []string{"sda3", "sdb3"}}, } if want, have := len(refs), len(mdStats); want != have {