Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support the failure/success threshold for probe #233

Merged
merged 6 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cmd/easeprobe/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ import (

func configProbers(probers []probe.Prober) []probe.Prober {
gProbeConf := global.ProbeSettings{
Interval: conf.Get().Settings.Probe.Interval,
Timeout: conf.Get().Settings.Probe.Timeout,
Interval: conf.Get().Settings.Probe.Interval,
Timeout: conf.Get().Settings.Probe.Timeout,
StatusChangeThresholdSettings: conf.Get().Settings.Probe.StatusChangeThresholdSettings,
}
log.Debugf("Global Probe Configuration: %+v", gProbeConf)

Expand Down
5 changes: 3 additions & 2 deletions conf/conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,9 @@ type Notify struct {

// Probe is the settings of prober
type Probe struct {
Interval time.Duration `yaml:"interval" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe,default=1m"`
Timeout time.Duration `yaml:"timeout" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe,default=30s"`
Interval time.Duration `yaml:"interval" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe,default=1m"`
Timeout time.Duration `yaml:"timeout" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe,default=30s"`
global.StatusChangeThresholdSettings `yaml:",inline" json:",inline"`
}

// SLAReport is the settings for SLA report
Expand Down
6 changes: 5 additions & 1 deletion docs/Manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,8 @@ The following example configurations illustrate the EaseProbe supported features

- `timeout` - the maximum time to wait for the probe to complete. default: `30s`.
- `interval` - the interval time to run the probe. default: `1m`.

- `failure` - the number of consecutive failed probes needed to mark the status as down. default: `1`.
- `success` - the number of consecutive successful probes needed to mark the status as up. default: `1`.

## 7.1 HTTP Probe Configuration

Expand Down Expand Up @@ -1308,6 +1309,9 @@ settings:
probe:
timeout: 30s # the time out for all probes
interval: 1m # probe every minute for all probes
failure: 2 # number of consecutive failed probes needed to determine the status down, default: 1
success: 1 # number of consecutive successful probes needed to determine the status up, default: 1


# easeprobe program running log file.
log:
Expand Down
2 changes: 2 additions & 0 deletions global/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ const (
DefaultTimeOut = time.Second * 30
// DefaultChannelName is the default wide channel name
DefaultChannelName = "__EaseProbe_Channel__"
// DefaultStatusChangeThresholdSetting is the threshold of status change
DefaultStatusChangeThresholdSetting = 1
)

const (
Expand Down
17 changes: 17 additions & 0 deletions global/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,19 @@ package global

import "time"

// StatusChangeThresholdSettings is the settings for probe threshold.
//
// It is embedded both in the global probe settings and in each probe's own
// options; ProbeSettings.NormalizeThreshold combines the two.
type StatusChangeThresholdSettings struct {
	// Failure is the failure threshold, such as 2 or 5: the number of
	// consecutive failed probes needed to determine the status down.
	// The documented default is 1.
	Failure int `yaml:"failure,omitempty" json:"failure,omitempty" jsonschema:"title=Failure Threshold,description=the failures threshold to change the status such as 3,default=1"`
	// Success is the success threshold, such as 2 or 5: the number of
	// consecutive successful probes needed to determine the status up.
	// The documented default is 1.
	Success int `yaml:"success,omitempty" json:"success,omitempty" jsonschema:"title=Success Threshold,description=the success threshold to change the status such as 2,default=1"`
}

// ProbeSettings is the global probe setting.
//
// Its fields are the global values handed to the Normalize* methods, which
// combine them with a probe's own value.
type ProbeSettings struct {
	// Interval is the global probe interval, used by NormalizeInterval.
	Interval time.Duration

	// Timeout is the global probe timeout.
	// NOTE(review): presumably consumed by NormalizeTimeOut — its body is not visible here.
	Timeout time.Duration

	// StatusChangeThresholdSettings holds the global failure/success
	// thresholds, used by NormalizeThreshold.
	StatusChangeThresholdSettings
}

// NormalizeTimeOut return a normalized timeout value
Expand All @@ -34,3 +43,11 @@ func (p *ProbeSettings) NormalizeTimeOut(t time.Duration) time.Duration {
func (p *ProbeSettings) NormalizeInterval(t time.Duration) time.Duration {
return normalize(p.Interval, t, 0, DefaultProbeInterval)
}

// NormalizeThreshold returns the effective thresholds for a probe.
//
// Each of the two counters is resolved independently through normalize,
// with the global value taken from p, the probe-level value taken from t,
// 0 passed as the validity bound and DefaultStatusChangeThresholdSetting
// as the last-resort default.
func (p *ProbeSettings) NormalizeThreshold(t StatusChangeThresholdSettings) StatusChangeThresholdSettings {
	var effective StatusChangeThresholdSettings
	effective.Failure = normalize(p.Failure, t.Failure, 0, DefaultStatusChangeThresholdSetting)
	effective.Success = normalize(p.Success, t.Success, 0, DefaultStatusChangeThresholdSetting)
	return effective
}
62 changes: 62 additions & 0 deletions global/probe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,67 @@ func TestProbe(t *testing.T) {
p.Interval = 20
r = p.NormalizeInterval(0)
assert.Equal(t, time.Duration(20), r)
}

func TestStatusChangeThresholdSettings(t *testing.T) {
p := ProbeSettings{}

r := p.NormalizeThreshold(StatusChangeThresholdSettings{})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: DefaultStatusChangeThresholdSetting,
Success: DefaultStatusChangeThresholdSetting,
}, r)

p.Failure = 2
p.Success = 3

r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Failure: 1,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 1,
Success: 3,
}, r)

r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Success: 2,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 2,
Success: 2,
}, r)

r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Failure: 5,
Success: 6,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 5,
Success: 6,
}, r)

r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Failure: 0,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 2,
Success: 3,
}, r)

r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Success: -1,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 2,
Success: 3,
}, r)

p.Failure = -1
r = p.NormalizeThreshold(StatusChangeThresholdSettings{
Failure: 0,
})
assert.Equal(t, StatusChangeThresholdSettings{
Failure: 1,
Success: 3,
}, r)
}
77 changes: 55 additions & 22 deletions probe/base/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,16 @@ type ProbeFuncType func() (bool, string)

// DefaultProbe is the default options for all probe
type DefaultProbe struct {
ProbeKind string `yaml:"-" json:"-"`
ProbeTag string `yaml:"-" json:"-"`
ProbeName string `yaml:"name" json:"name" jsonschema:"required,title=Probe Name,description=the name of probe must be unique"`
ProbeChannels []string `yaml:"channels" json:"channels,omitempty" jsonschema:"title=Probe Channels,description=the channels of probe message need to send to"`
ProbeTimeout time.Duration `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe"`
ProbeTimeInterval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe"`
ProbeFunc ProbeFuncType `yaml:"-" json:"-"`
ProbeResult *probe.Result `yaml:"-" json:"-"`
metrics *metrics `yaml:"-" json:"-"`
ProbeKind string `yaml:"-" json:"-"`
ProbeTag string `yaml:"-" json:"-"`
ProbeName string `yaml:"name" json:"name" jsonschema:"required,title=Probe Name,description=the name of probe must be unique"`
ProbeChannels []string `yaml:"channels" json:"channels,omitempty" jsonschema:"title=Probe Channels,description=the channels of probe message need to send to"`
ProbeTimeout time.Duration `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe"`
ProbeTimeInterval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe"`
global.StatusChangeThresholdSettings `yaml:",inline" json:",inline"`
ProbeFunc ProbeFuncType `yaml:"-" json:"-"`
ProbeResult *probe.Result `yaml:"-" json:"-"`
metrics *metrics `yaml:"-" json:"-"`
}

// Kind return the probe kind
Expand Down Expand Up @@ -85,6 +86,35 @@ func (d *DefaultProbe) Result() *probe.Result {
return d.ProbeResult
}

// LogTitle returns the bracketed prefix used in log lines for this probe:
// "[kind / tag / name]" when the probe has a tag, "[kind / name]" otherwise.
func (d *DefaultProbe) LogTitle() string {
	if d.ProbeTag == "" {
		return fmt.Sprintf("[%s / %s]", d.ProbeKind, d.ProbeName)
	}
	return fmt.Sprintf("[%s / %s / %s]", d.ProbeKind, d.ProbeTag, d.ProbeName)
}

// CheckStatusThreshold decides which status to report after the latest probe
// result has been appended to the result's StatusCounter.
//
// It returns:
//   - probe.StatusUp when the counter's current status is success and its
//     count has reached the Success threshold;
//   - probe.StatusDown when the counter's current status is failure and its
//     count has reached the Failure threshold;
//   - otherwise the status currently recorded on the result, unchanged.
func (d *DefaultProbe) CheckStatusThreshold() probe.Status {
	s := d.StatusChangeThresholdSettings
	c := d.ProbeResult.Stat.StatusCounter
	title := d.LogTitle()
	log.Debugf(" %s - Status Threshold Checking - Current[%v], StatusCnt[%d], FailureThread[%d], SuccessThread[%d]",
		title, c.CurrentStatus, c.StatusCount, s.Failure, s.Success)

	if c.CurrentStatus && c.StatusCount >= s.Success {
		log.Infof("%s - Status is UP! Meet the Success Threshold [%d/%d]", title, c.StatusCount, s.Success)
		return probe.StatusUp
	}
	if !c.CurrentStatus && c.StatusCount >= s.Failure {
		log.Infof("%s - Status is DOWN! Meet the Failure Threshold [%d/%d]", title, c.StatusCount, s.Failure)
		return probe.StatusDown
	}

	// No threshold reached: keep the status the result holds right now.
	// Status (not PreStatus) is the right source: Probe() refreshes PreStatus
	// only after this check, so PreStatus is still one round behind here and
	// returning it could revert an already-applied status change.
	// Status is also correct if a caller refreshes PreStatus before the check,
	// because the two are equal at that point.
	current := d.ProbeResult.Status
	log.Infof("%s - Keep the Status as %s! Not meet the Threshold - Current[%v], StatusCnt[%d], FailureThread[%d], SuccessThread[%d]",
		title, current, c.CurrentStatus, c.StatusCount, s.Failure, s.Success)
	return current
}

// Config default config
func (d *DefaultProbe) Config(gConf global.ProbeSettings,
kind, tag, name, endpoint string, fn ProbeFuncType) error {
Expand All @@ -96,20 +126,25 @@ func (d *DefaultProbe) Config(gConf global.ProbeSettings,

d.ProbeTimeout = gConf.NormalizeTimeOut(d.ProbeTimeout)
d.ProbeTimeInterval = gConf.NormalizeInterval(d.ProbeTimeInterval)
d.StatusChangeThresholdSettings = gConf.NormalizeThreshold(d.StatusChangeThresholdSettings)

d.ProbeResult = probe.NewResultWithName(name)
d.ProbeResult.Name = name
d.ProbeResult.Endpoint = endpoint

// Set the new length of the status counter
maxLen := d.StatusChangeThresholdSettings.Failure
if d.StatusChangeThresholdSettings.Success > maxLen {
maxLen = d.StatusChangeThresholdSettings.Success
}
d.ProbeResult.Stat.StatusCounter.SetMaxLen(maxLen)

// if there no channels, use the default channel
if len(d.ProbeChannels) == 0 {
d.ProbeChannels = append(d.ProbeChannels, global.DefaultChannelName)
}

if len(d.ProbeTag) > 0 {
log.Infof("Probe [%s / %s] - [%s] base options are configured!", d.ProbeKind, d.ProbeTag, d.ProbeName)
} else {
log.Infof("Probe [%s] - [%s] base options are configured!", d.ProbeKind, d.ProbeName)
}
log.Infof("Probe %s base options are configured!", d.LogTitle())

d.metrics = newMetrics(kind, tag)

Expand All @@ -130,21 +165,19 @@ func (d *DefaultProbe) Probe() probe.Result {

d.ProbeResult.RoundTripTime = time.Since(now)

status := probe.StatusUp
title := "Success"
if stat != true {
status = probe.StatusDown
title = "Error"
}
// check the status threshold
d.ProbeResult.Stat.StatusCounter.AppendStatus(stat, msg)
status := d.CheckStatusThreshold()
title := status.Title()

if len(d.ProbeTag) > 0 {
d.ProbeResult.Message = fmt.Sprintf("%s (%s/%s): %s", title, d.ProbeKind, d.ProbeTag, msg)
log.Debugf("[%s / %s / %s] - %s", d.ProbeKind, d.ProbeTag, d.ProbeName, msg)
} else {
d.ProbeResult.Message = fmt.Sprintf("%s (%s): %s", title, d.ProbeKind, msg)
log.Debugf("[%s / %s] - %s", d.ProbeKind, d.ProbeName, msg)
}

log.Debugf("%s - %s", d.LogTitle(), msg)

d.ProbeResult.PreStatus = d.ProbeResult.Status
d.ProbeResult.Status = status

Expand Down
62 changes: 62 additions & 0 deletions probe/base/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package base

import (
"fmt"
"math/rand"
"net"
"os"
Expand Down Expand Up @@ -147,3 +148,64 @@ func TestProxyConnection(t *testing.T) {

monkey.UnpatchAll()
}

// TestStatusThreshold drives a dummy prober through a run of successes and
// then a run of failures. It checks that the reported status only flips once
// the matching threshold is reached, and that the status counter stops
// growing at its maximum length.
func TestStatusThreshold(t *testing.T) {
	p := newDummyProber("probe")
	p.StatusChangeThresholdSettings = global.StatusChangeThresholdSettings{
		Failure: 2,
		Success: 3,
	}

	// The probe-level thresholds above must win over these global ones.
	globalConf := global.ProbeSettings{
		StatusChangeThresholdSettings: global.StatusChangeThresholdSettings{
			Failure: 2,
			Success: 1,
		},
	}
	p.Config(globalConf)
	assert.Equal(t, 3, p.ProbeResult.Stat.MaxLen)
	assert.Equal(t, 2, p.StatusChangeThresholdSettings.Failure)
	assert.Equal(t, 3, p.StatusChangeThresholdSettings.Success)

	p.ProbeResult.Status = probe.StatusInit

	rounds := p.ProbeResult.Stat.MaxLen + 2

	// drive probes `rounds` times and checks, after every round, both the
	// reported status (`before` until `threshold` rounds, `after` from then
	// on) and the counter value (capped at the counter's max length).
	drive := func(threshold int, before, after probe.Status) {
		for round := 1; round <= rounds; round++ {
			p.Probe()

			wantStatus := after
			if round < threshold {
				wantStatus = before
			}
			assert.Equal(t, wantStatus, p.Result().Status)

			wantCount := p.ProbeResult.Stat.MaxLen
			if round < wantCount {
				wantCount = round
			}
			assert.Equal(t, wantCount, p.ProbeResult.Stat.StatusCount)
		}
	}

	calls := 0

	// Phase 1: every probe succeeds.
	p.ProbeFunc = func() (bool, string) {
		calls++
		return true, fmt.Sprintf("success - %d", calls)
	}
	drive(p.StatusChangeThresholdSettings.Success, probe.StatusInit, probe.StatusUp)

	// Phase 2: every probe fails.
	calls = 0
	p.ProbeFunc = func() (bool, string) {
		calls++
		return false, fmt.Sprintf("failure - %d", calls)
	}
	drive(p.StatusChangeThresholdSettings.Failure, probe.StatusUp, probe.StatusDown)
}
10 changes: 6 additions & 4 deletions probe/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ var testResults = []Result{
StatusUp: 70,
StatusDown: 30,
},
UpTime: 70 * time.Minute,
DownTime: 30 * time.Minute,
UpTime: 70 * time.Minute,
DownTime: 30 * time.Minute,
StatusCounter: *NewStatusCounter(1),
},
},
{
Expand All @@ -74,8 +75,9 @@ var testResults = []Result{
StatusUp: 270,
StatusDown: 30,
},
UpTime: 270 * time.Minute,
DownTime: 30 * time.Minute,
UpTime: 270 * time.Minute,
DownTime: 30 * time.Minute,
StatusCounter: *NewStatusCounter(2),
},
},
}
Expand Down
Loading