Skip to content

Commit

Permalink
feat(inputs.lustre2): Add health-check metric (#15042)
Browse files Browse the repository at this point in the history
Co-authored-by: Josh Powers <[email protected]>
  • Loading branch information
lukeyeager and powersj authored Mar 22, 2024
1 parent 4c1aa59 commit 2dde6a0
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 53 deletions.
7 changes: 7 additions & 0 deletions plugins/inputs/lustre2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.

## Metrics

From `/sys/fs/lustre/health_check`:

- lustre2
  - tags:
  - fields:
    - health (uint64, `1` when the file reads `healthy`, `0` otherwise)

From `/proc/fs/lustre/obdfilter/*/stats` and
`/proc/fs/lustre/osd-ldiskfs/*/stats`:

Expand Down
108 changes: 70 additions & 38 deletions plugins/inputs/lustre2/lustre2.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ type Lustre2 struct {
OstProcfiles []string `toml:"ost_procfiles"`
MdsProcfiles []string `toml:"mds_procfiles"`

// used by the testsuite to generate mock sysfs and procfs files
rootdir string

// allFields maps an OST name to the metric fields associated with that OST
allFields map[tags]map[string]interface{}
}
Expand Down Expand Up @@ -376,8 +379,49 @@ func (*Lustre2) SampleConfig() string {
return sampleConfig
}

// GetLustreHealth reads the Lustre health_check file and stores the result as
// the "health" field under the empty tag set in l.allFields. The field is 1
// when the trimmed file content equals "healthy" and 0 otherwise.
//
// The sysfs location is tried first, then the older procfs location. When
// neither can be stat'ed, nothing is recorded and nil is returned. An error is
// returned only when a file that was found cannot be read.
func (l *Lustre2) GetLustreHealth() error {
	// the linter complains about using an element containing '/' in filepath.Join()
	// so we explicitly set the rootdir default to '/' in this function rather than
	// starting the second element with a '/'.
	rootdir := l.rootdir
	if rootdir == "" {
		rootdir = "/"
	}

	filename := filepath.Join(rootdir, "sys", "fs", "lustre", "health_check")
	if _, err := os.Stat(filename); err != nil {
		// try falling back to the old procfs location
		// it was moved in https://github.com/lustre/lustre-release/commit/5d368bd0b2
		filename = filepath.Join(rootdir, "proc", "fs", "lustre", "health_check")
		if _, err = os.Stat(filename); err != nil {
			return nil //nolint: nilerr // we don't want to return an error if the file doesn't exist
		}
	}

	contents, err := os.ReadFile(filename)
	if err != nil {
		return err
	}

	var health uint64
	if strings.TrimSpace(string(contents)) == "healthy" {
		health = 1
	}

	// Assigning into a nil map panics, so make sure the map exists even when
	// this exported method is called before anything has initialised it.
	if l.allFields == nil {
		l.allFields = make(map[tags]map[string]interface{})
	}

	// The health metric carries no tags, so it lives under the zero-value key.
	t := tags{}
	fields, ok := l.allFields[t]
	if !ok {
		fields = make(map[string]interface{})
		l.allFields[t] = fields
	}
	fields["health"] = health

	return nil
}

func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) error {
files, err := filepath.Glob(fileglob)
files, err := filepath.Glob(filepath.Join(l.rootdir, fileglob))
if err != nil {
return err
}
Expand Down Expand Up @@ -465,7 +509,7 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) e
}

func (l *Lustre2) getLustreProcBrwStats(fileglob string, wantedFields []*mapping) error {
files, err := filepath.Glob(fileglob)
files, err := filepath.Glob(filepath.Join(l.rootdir, fileglob))
if err != nil {
return fmt.Errorf("failed to find files matching glob %s: %w", fileglob, err)
}
Expand Down Expand Up @@ -560,45 +604,32 @@ func (l *Lustre2) getLustreProcBrwStats(fileglob string, wantedFields []*mapping
func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
l.allFields = make(map[tags]map[string]interface{})

err := l.GetLustreHealth()
if err != nil {
return err
}

if len(l.OstProcfiles) == 0 {
// read/write bytes are in obdfilter/<ost_name>/stats
err := l.GetLustreProcStats("/proc/fs/lustre/obdfilter/*/stats", wantedOstFields)
if err != nil {
return err
}
// cache counters are in osd-ldiskfs/<ost_name>/stats
err = l.GetLustreProcStats("/proc/fs/lustre/osd-ldiskfs/*/stats", wantedOstFields)
if err != nil {
return err
}
// per job statistics are in obdfilter/<ost_name>/job_stats
err = l.GetLustreProcStats("/proc/fs/lustre/obdfilter/*/job_stats", wantedOstJobstatsFields)
if err != nil {
return err
}
// bulk read/wrote statistics for ldiskfs
err = l.getLustreProcBrwStats("/proc/fs/lustre/osd-ldiskfs/*/brw_stats", wantedBrwstatsFields)
if err != nil {
return err
}
// bulk read/write statistics for zfs
err = l.getLustreProcBrwStats("/proc/fs/lustre/osd-zfs/*/brw_stats", wantedBrwstatsFields)
if err != nil {
return err
l.OstProcfiles = []string{
// read/write bytes are in obdfilter/<ost_name>/stats
"/proc/fs/lustre/obdfilter/*/stats",
// cache counters are in osd-ldiskfs/<ost_name>/stats
"/proc/fs/lustre/osd-ldiskfs/*/stats",
// per job statistics are in obdfilter/<ost_name>/job_stats
"/proc/fs/lustre/obdfilter/*/job_stats",
// bulk read/write statistics for ldiskfs
"/proc/fs/lustre/osd-ldiskfs/*/brw_stats",
// bulk read/write statistics for zfs
"/proc/fs/lustre/osd-zfs/*/brw_stats",
}
}

if len(l.MdsProcfiles) == 0 {
// Metadata server stats
err := l.GetLustreProcStats("/proc/fs/lustre/mdt/*/md_stats", wantedMdsFields)
if err != nil {
return err
}

// Metadata target job stats
err = l.GetLustreProcStats("/proc/fs/lustre/mdt/*/job_stats", wantedMdtJobstatsFields)
if err != nil {
return err
l.MdsProcfiles = []string{
// Metadata server stats
"/proc/fs/lustre/mdt/*/md_stats",
// Metadata target job stats
"/proc/fs/lustre/mdt/*/job_stats",
}
}

Expand Down Expand Up @@ -640,8 +671,9 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
}

for tgs, fields := range l.allFields {
tags := map[string]string{
"name": tgs.name,
tags := map[string]string{}
if len(tgs.name) > 0 {
tags["name"] = tgs.name
}
if len(tgs.brwSection) > 0 {
tags["brw_section"] = tgs.brwSection
Expand Down
56 changes: 41 additions & 15 deletions plugins/inputs/lustre2/lustre2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,43 @@ disk I/O size ios % cum % | ios % cum %
1M: 43866371 99 100 | 850248 57 100
`

// TestLustre2GeneratesHealth writes a mock sysfs health_check file containing
// "healthy" under a temporary root directory and checks that Gather reports
// an untagged lustre2 metric with health set to 1.
func TestLustre2GeneratesHealth(t *testing.T) {
	// t.TempDir is removed automatically when the test finishes, so no
	// explicit cleanup is required.
	rootdir := t.TempDir() + "/telegraf"
	sysdir := rootdir + "/sys/fs/lustre/"
	require.NoError(t, os.MkdirAll(sysdir, 0750))
	require.NoError(t, os.WriteFile(sysdir+"health_check", []byte("healthy\n"), 0640))

	m := &Lustre2{rootdir: rootdir}

	var acc testutil.Accumulator
	require.NoError(t, m.Gather(&acc))

	acc.AssertContainsTaggedFields(
		t,
		"lustre2",
		map[string]interface{}{
			"health": uint64(1),
		},
		map[string]string{},
	)
}

func TestLustre2GeneratesMetrics(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "telegraf-lustre")
require.NoError(t, err)
defer os.RemoveAll(tmpDir)

tempdir := tmpDir + "/telegraf/proc/fs/lustre/"
rootdir := tmpDir + "/telegraf"
tempdir := rootdir + "/proc/fs/lustre/"
ostName := "OST0001"

mdtdir := tempdir + "/mdt/"
Expand All @@ -199,10 +230,7 @@ func TestLustre2GeneratesMetrics(t *testing.T) {
require.NoError(t, err)

// Begin by testing standard Lustre stats
m := &Lustre2{
OstProcfiles: []string{obddir + "/*/stats", osddir + "/*/stats"},
MdsProcfiles: []string{mdtdir + "/*/md_stats"},
}
m := &Lustre2{rootdir: rootdir}

var acc testutil.Accumulator

Expand Down Expand Up @@ -247,7 +275,8 @@ func TestLustre2GeneratesClientMetrics(t *testing.T) {
require.NoError(t, err)
defer os.RemoveAll(tmpDir)

tempdir := tmpDir + "/telegraf/proc/fs/lustre/"
rootdir := tmpDir + "/telegraf"
tempdir := rootdir + "/proc/fs/lustre/"
ostName := "OST0001"
clientName := "10.2.4.27@o2ib1"
mdtdir := tempdir + "/mdt/"
Expand Down Expand Up @@ -311,7 +340,8 @@ func TestLustre2GeneratesJobstatsMetrics(t *testing.T) {
require.NoError(t, err)
defer os.RemoveAll(tmpDir)

tempdir := tmpDir + "/telegraf/proc/fs/lustre/"
rootdir := tmpDir + "/telegraf"
tempdir := rootdir + "/proc/fs/lustre/"
ostName := "OST0001"
jobNames := []string{"cluster-testjob1", "testjob2"}

Expand All @@ -330,10 +360,7 @@ func TestLustre2GeneratesJobstatsMetrics(t *testing.T) {
require.NoError(t, err)

// Test Lustre Jobstats
m := &Lustre2{
OstProcfiles: []string{obddir + "/*/job_stats"},
MdsProcfiles: []string{mdtdir + "/*/job_stats"},
}
m := &Lustre2{rootdir: rootdir}

var acc testutil.Accumulator

Expand Down Expand Up @@ -474,7 +501,8 @@ func TestLustre2GeneratesBrwstatsMetrics(t *testing.T) {
require.NoError(t, err)
defer os.RemoveAll(tmpdir)

tempdir := tmpdir + "/telegraf/proc/fs/lustre/"
rootdir := tmpdir + "/telegraf"
tempdir := rootdir + "/proc/fs/lustre"
ostname := "OST0001"

osddir := tempdir + "/osd-ldiskfs/"
Expand All @@ -484,9 +512,7 @@ func TestLustre2GeneratesBrwstatsMetrics(t *testing.T) {
err = os.WriteFile(osddir+"/"+ostname+"/brw_stats", []byte(brwstatsProcContents), 0640)
require.NoError(t, err)

m := &Lustre2{
OstProcfiles: []string{osddir + "/*/brw_stats"},
}
m := &Lustre2{rootdir: rootdir}

var acc testutil.Accumulator

Expand Down

0 comments on commit 2dde6a0

Please sign in to comment.