diff --git a/go.mod b/go.mod index da16a0de..6066aea9 100644 --- a/go.mod +++ b/go.mod @@ -28,10 +28,10 @@ require ( github.com/mitchellh/go-homedir v1.1.0 github.com/mitchellh/mapstructure v1.5.0 github.com/nats-io/jwt/v2 v2.3.0 - github.com/nats-io/nats-server/v2 v2.9.14 + github.com/nats-io/nats-server/v2 v2.9.15 github.com/nats-io/nats.go v1.24.0 github.com/nutsdb/nutsdb v0.12.0 - github.com/onsi/ginkgo/v2 v2.8.4 + github.com/onsi/ginkgo/v2 v2.9.0 github.com/onsi/gomega v1.27.2 github.com/pelletier/go-toml v1.9.5 github.com/prometheus/client_golang v1.14.0 @@ -41,28 +41,28 @@ require ( github.com/spf13/jwalterweatherman v1.1.0 github.com/spf13/viper v1.15.0 github.com/vbauerster/mpb/v5 v5.4.0 - github.com/xanzy/go-gitlab v0.80.2 + github.com/xanzy/go-gitlab v0.80.3 github.com/xhit/go-simple-mail v2.2.2+incompatible github.com/xujiajun/utils v0.0.0-20220904132955-5f7c5b914235 - golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 - golang.org/x/net v0.7.0 - golang.org/x/oauth2 v0.5.0 + golang.org/x/exp v0.0.0-20230307190834-24139beb5833 + golang.org/x/net v0.8.0 + golang.org/x/oauth2 v0.6.0 golang.org/x/sync v0.1.0 - golang.org/x/sys v0.5.0 - golang.org/x/term v0.5.0 + golang.org/x/sys v0.6.0 + golang.org/x/term v0.6.0 gopkg.in/yaml.v3 v3.0.1 gorm.io/driver/clickhouse v0.5.0 gorm.io/driver/mysql v1.4.7 gorm.io/driver/postgres v1.4.8 gorm.io/driver/sqlite v1.4.4 gorm.io/driver/sqlserver v1.4.2 - gorm.io/gorm v1.24.5 + gorm.io/gorm v1.24.6 ) require ( github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/ClickHouse/ch-go v0.53.0 // indirect - github.com/ClickHouse/clickhouse-go/v2 v2.6.5 // indirect + github.com/ClickHouse/clickhouse-go/v2 v2.7.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver v1.5.0 // indirect github.com/Masterminds/sprig v2.22.0+incompatible // indirect @@ -89,14 +89,14 @@ require ( github.com/aws/smithy-go v1.13.5 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bwmarrin/snowflake v0.3.0 // indirect - github.com/bytedance/sonic v1.8.2 // indirect + github.com/bytedance/sonic v1.8.3 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect github.com/cockroachdb/errors v1.9.1 // indirect github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect github.com/cockroachdb/pebble v0.0.0-20210331181633-27fc006b8bfb // indirect github.com/cockroachdb/redact v1.1.3 // indirect - github.com/getsentry/sentry-go v0.18.0 // indirect + github.com/getsentry/sentry-go v0.19.0 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect github.com/go-faster/city v1.0.1 // indirect @@ -167,7 +167,7 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pkg/term v1.2.0-beta.2 // indirect github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.40.0 // indirect + github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect github.com/rivo/uniseg v0.4.4 // indirect github.com/rogpeppe/go-internal v1.9.0 // indirect @@ -175,7 +175,7 @@ require ( github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect github.com/segmentio/asm v1.2.0 // indirect github.com/shopspring/decimal v1.3.1 // indirect - github.com/spf13/afero v1.9.4 // indirect + github.com/spf13/afero v1.9.5 // indirect github.com/spf13/cast v1.5.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect @@ -191,12 +191,12 @@ require ( github.com/yusufpapurcu/wmi v1.2.2 // indirect go.opentelemetry.io/otel v1.14.0 // indirect go.opentelemetry.io/otel/trace v1.14.0 // indirect - golang.org/x/arch v0.2.0 // indirect - golang.org/x/crypto v0.6.0 // indirect - golang.org/x/mod v0.8.0 // indirect - golang.org/x/text v0.7.0 // indirect + golang.org/x/arch v0.3.0 // indirect + golang.org/x/crypto v0.7.0 // indirect + golang.org/x/mod v0.9.0 // indirect + golang.org/x/text v0.8.0 // indirect golang.org/x/time v0.3.0 // indirect - golang.org/x/tools v0.6.0 // indirect + golang.org/x/tools v0.7.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/protobuf v1.28.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect diff --git a/monitor/info/encode.go b/monitor/info/encode.go index 70967db2..0d75597a 100644 --- a/monitor/info/encode.go +++ b/monitor/info/encode.go @@ -48,11 +48,11 @@ func (e *encodingModel) stringInfo() string { buf = bytes.NewBuffer(make([]byte, 0)) ) - for _, i := range e.Info { - buf.WriteString(fmt.Sprintf("%v", i)) + for n, i := range e.Info { + buf.WriteString(fmt.Sprintf("%s: %v,", n, i)) } - return buf.String() + return strings.Trim(strings.TrimSpace(buf.String()), ",") } func (e *encodingModel) stringClean(str string) string { diff --git a/monitor/internalConfig.go b/monitor/internalConfig.go index c99a3940..7db98783 100644 --- a/monitor/internalConfig.go +++ b/monitor/internalConfig.go @@ -33,8 +33,6 @@ import ( "github.com/nabbar/golib/monitor/types" - monsts "github.com/nabbar/golib/monitor/status" - liberr "github.com/nabbar/golib/errors" liblog "github.com/nabbar/golib/logger" ) @@ -264,32 +262,14 @@ func (o *mon) getFct() types.HealthCheck { func (o *mon) getLastCheck() *lastRun { if i, l := o.x.Load(keyLastRun); !l { - return &lastRun{ - status: monsts.KO, - runtime: time.Now(), - isRise: false, - isFall: false, - } + return newLastRun() } else if v, k := i.(*lastRun); !k { - return &lastRun{ - status: monsts.KO, - runtime: time.Now(), - isRise: false, - isFall: false, - } + return newLastRun() } else { return v } } -func (o *mon) setLastCheck(m middleWare) error { - e := m.Next() - l := &lastRun{ - status: o.Status(), - runtime: time.Now(), - isRise: o.IsRise(), - isFall: o.IsFall(), - } +func (o *mon) setLastCheck(l *lastRun) { o.x.Store(keyLastRun, l) - return e } diff --git a/monitor/last.go b/monitor/last.go index bde2310c..25d18fcd 100644 --- a/monitor/last.go +++ b/monitor/last.go @@ -27,14 +27,211 @@ package monitor import ( + "fmt" + "sync" "time" monsts "github.com/nabbar/golib/monitor/status" ) type lastRun struct { + m sync.RWMutex + status monsts.Status runtime time.Time - isRise bool - isFall bool + + isRise bool + isFall bool + + cntRise uint8 + cntFall uint8 + + uptime time.Duration + downtime time.Duration + riseTime time.Duration + fallTime time.Duration + latency time.Duration + + err error +} + +func newLastRun() *lastRun { + return &lastRun{ + m: sync.RWMutex{}, + status: monsts.KO, + runtime: time.Now(), + isRise: false, + isFall: false, + cntRise: 0, + cntFall: 0, + uptime: 0, + downtime: 0, + riseTime: 0, + fallTime: 0, + latency: 0, + err: fmt.Errorf("no healcheck still run"), + } +} + +func (o *lastRun) Latency() time.Duration { + o.m.RLock() + defer o.m.RUnlock() + return o.latency +} + +func (o *lastRun) FallTime() time.Duration { + o.m.RLock() + defer o.m.RUnlock() + return o.fallTime +} + +func (o *lastRun) RiseTime() time.Duration { + o.m.RLock() + defer o.m.RUnlock() + return o.riseTime +} + +func (o *lastRun) UpTime() time.Duration { + o.m.RLock() + defer o.m.RUnlock() + return o.uptime +} + +func (o *lastRun) DownTime() time.Duration { + o.m.RLock() + defer o.m.RUnlock() + return o.downtime +} + +func (o *lastRun) Status() monsts.Status { + o.m.RLock() + defer o.m.RUnlock() + return o.status +} + +func (o *lastRun) IsRise() bool { + o.m.RLock() + defer o.m.RUnlock() + return o.isRise +} + +func (o *lastRun) IsFall() bool { + o.m.RLock() + defer o.m.RUnlock() + return o.isFall +} + +func (o *lastRun) Error() error { + o.m.RLock() + defer o.m.RUnlock() + return o.err +} + +func (o *lastRun) setStatus(err error, dur time.Duration, cfg *runCfg) { + o.m.Lock() + defer o.m.Unlock() + + o.latency = dur + + if err != nil { + o.err = err + o.setStatusFall(cfg) + } else { + o.err = nil + o.setStatusRise(cfg) + } +} + +func (o *lastRun) setStatusFall(cfg *runCfg) { + if cfg == nil { + return + } + + sts := o.status + dur := time.Since(o.runtime) + o.runtime = time.Now() + + o.cntRise = 0 + o.cntFall++ + + switch sts { + case monsts.OK: + if o.cntFall >= cfg.riseCountWarn { + o.cntFall = 0 + o.status = monsts.Warn + } else { + o.status = monsts.OK + } + o.isFall = true + o.isRise = false + o.fallTime += dur + o.uptime += dur + + case monsts.Warn: + if o.cntFall >= cfg.riseCountKO { + o.isFall = false + o.cntFall = 0 + o.status = monsts.KO + } else { + o.isFall = true + o.status = monsts.Warn + } + o.isRise = false + o.fallTime += dur + o.downtime += dur + + default: + o.cntFall = 0 + o.isFall = false + o.isRise = false + o.status = monsts.KO + o.downtime += dur + } +} + +func (o *lastRun) setStatusRise(cfg *runCfg) { + if cfg == nil { + return + } + + sts := o.status + dur := time.Since(o.runtime) + o.runtime = time.Now() + + o.cntFall = 0 + o.cntRise++ + + switch sts { + case monsts.KO: + if o.cntRise >= cfg.riseCountKO { + o.cntRise = 0 + o.status = monsts.Warn + } else { + o.status = monsts.KO + } + o.isFall = false + o.isRise = true + o.riseTime += dur + o.downtime += dur + + case monsts.Warn: + if o.cntRise >= cfg.riseCountWarn { + o.cntRise = 0 + o.isRise = false + o.status = monsts.OK + } else { + o.isRise = true + o.status = monsts.Warn + } + o.isFall = false + o.riseTime += dur + o.downtime += dur + + default: + o.cntRise = 0 + o.isFall = false + o.isRise = false + o.status = monsts.OK + o.uptime += dur + } } diff --git a/monitor/metrics.go b/monitor/metrics.go index 338c9c6e..77f6e5a6 100644 --- a/monitor/metrics.go +++ b/monitor/metrics.go @@ -65,57 +65,27 @@ func (o *mon) RegisterCollectMetrics(fct libprm.FuncCollectMetrics) { } func (o *mon) CollectLatency() time.Duration { - if i, l := o.x.LoadAndDelete(keyMetricLatency); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.Latency() } func (o *mon) CollectUpTime() time.Duration { - if i, l := o.x.LoadAndDelete(keyMetricUpTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.Uptime() } func (o *mon) CollectDownTime() time.Duration { - if i, l := o.x.LoadAndDelete(keyMetricDownTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.Downtime() } func (o *mon) CollectRiseTime() time.Duration { - if i, l := o.x.LoadAndDelete(keyMetricRiseTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.getLastCheck().RiseTime() } func (o *mon) CollectFallTime() time.Duration { - if i, l := o.x.LoadAndDelete(keyMetricFallTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.getLastCheck().FallTime() } -func (o *mon) CollectStatus() (sts string, rise bool, fall bool) { - return o.Status().String(), o.IsRise(), o.IsFall() +func (o *mon) CollectStatus() (sts monsts.Status, rise bool, fall bool) { + return o.Status(), o.IsRise(), o.IsFall() } func (o *mon) collectMetrics(ctx context.Context) { @@ -146,73 +116,3 @@ func (o *mon) collectMetrics(ctx context.Context) { f(ctx, n...) } - -func (o *mon) setLatency(m middleWare) error { - var ts = time.Now() - - ret := m.Next() - d := time.Since(ts) - - o.x.Store(keyMetricLatency, d) - - return ret -} - -func (o *mon) setUpTime(m middleWare) error { - ret := m.Next() - - if o.Status() != monsts.OK { - return ret - } - - last := o.getLastCheck() - - if last.status != monsts.OK { - return ret - } - - d := time.Since(last.runtime) + o.Uptime() - o.x.Store(keyMetricUpTime, d) - return ret -} - -func (o *mon) setDownTime(m middleWare) error { - ret := m.Next() - - if o.Status() != monsts.KO { - return ret - } - - last := o.getLastCheck() - if last.status != monsts.KO { - return ret - } - - d := time.Since(last.runtime) + o.Downtime() - o.x.Store(keyMetricDownTime, d) - return ret -} - -func (o *mon) setRiseTime(m middleWare) error { - ret := m.Next() - - if !o.IsRise() { - return ret - } - - last := o.getLastCheck() - o.x.Store(keyMetricRiseTime, o.CollectRiseTime()+time.Since(last.runtime)) - return ret -} - -func (o *mon) setFallTime(m middleWare) error { - ret := m.Next() - - if !o.IsFall() { - return ret - } - - last := o.getLastCheck() - o.x.Store(keyMetricFallTime, o.CollectFallTime()+time.Since(last.runtime)) - return ret -} diff --git a/monitor/middleware.go b/monitor/middleware.go index 3cabce04..7ebb4bab 100644 --- a/monitor/middleware.go +++ b/monitor/middleware.go @@ -78,7 +78,7 @@ func (m *mdl) Run(ctx context.Context) { m.ctx, cnl = context.WithTimeout(ctx, m.cfg.checkTimeout) defer cnl() - m.crs = len(m.mdl) - 1 + m.crs = len(m.mdl) _ = m.Next() } diff --git a/monitor/model.go b/monitor/model.go index 5afe4440..c766d3d0 100644 --- a/monitor/model.go +++ b/monitor/model.go @@ -43,21 +43,17 @@ const ( keyLogger = "keyLogger" keyLoggerDef = "keyLoggerDefault" keyHealthCheck = "keyFct" - keyStatus = "keySts" - keyMessage = "keyMsg" - keyRise = "keyRise" - keyFall = "keyFall" keyRun = "keyRun" keyLastRun = "keyLastRun" keyMetricsName = "keyMetricsName" keyMetricsFunc = "keyMetricsFunc" - keyMetricLatency = "metricLatency" - keyMetricUpTime = "metricUpTime" - keyMetricDownTime = "metricDownTime" - keyMetricRiseTime = "metricRiseTime" - keyMetricFallTime = "metricFallTime" + keyMetricLatency = "keyMetricLatency" + keyMetricUpTime = "keyMetricUpTime" + keyMetricDownTime = "keyMetricDownTime" + keyMetricRiseTime = "keyMetricRiseTime" + keyMetricFallTime = "keyMetricFallTime" LogFieldProcess = "process" LogValueProcess = "monitor" diff --git a/monitor/pool/interface.go b/monitor/pool/interface.go index 60e8f37c..ecd9492f 100644 --- a/monitor/pool/interface.go +++ b/monitor/pool/interface.go @@ -27,7 +27,11 @@ package pool import ( + "context" "sync" + "time" + + liblog "github.com/nabbar/golib/logger" libsrv "github.com/nabbar/golib/server" @@ -40,7 +44,10 @@ type Pool interface { montps.Pool libsrv.Server + InitMetrics(prm libprm.FuncGetPrometheus, log liblog.FuncLog) error RegisterFctProm(prm libprm.FuncGetPrometheus) + RegisterFctLogger(log liblog.FuncLog) + TriggerCollectMetrics(ctx context.Context, dur time.Duration) } func New(ctx libctx.FuncContext) Pool { diff --git a/monitor/pool/metrics.go b/monitor/pool/metrics.go index f94fd200..83dfda36 100644 --- a/monitor/pool/metrics.go +++ b/monitor/pool/metrics.go @@ -30,6 +30,8 @@ import ( "context" "strings" + liblog "github.com/nabbar/golib/logger" + montps "github.com/nabbar/golib/monitor/types" libprm "github.com/nabbar/golib/prometheus" libmet "github.com/nabbar/golib/prometheus/metrics" @@ -41,9 +43,11 @@ const ( metricLatency = "latency" metricUptime = "uptime" metricDowntime = "downtime" - metricRisetime = "risetime" - metricFalltime = "falltime" + metricRiseTime = "risetime" + metricFallTime = "falltime" metricStatus = "status" + metricRise = "rise" + metricFall = "fall" metricBoolTrue = "true" metricBoolFalse = "false" ) @@ -62,11 +66,10 @@ func (o *pool) normalizeName(name string) string { return name } -func (o *pool) getMetricName(monitor, metric string) string { +func (o *pool) getMetricName(metric string) string { part := make([]string, 0) part = append(part, o.normalizeName(metricBaseName)) part = append(part, o.normalizeName(metric)) - part = append(part, o.normalizeName(monitor)) return strings.Join(part, "_") } @@ -83,139 +86,352 @@ func (o *pool) getProm() libprm.Prometheus { return nil } -func (o *pool) createMetrics(mon montps.Monitor) error { - var prm libprm.Prometheus +func (o *pool) createMetricsLatency() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) if prm = o.getProm(); prm == nil { return nil } + mnm = o.getMetricName(metricLatency) + met = libmet.NewMetrics(mnm, prmtps.Histogram) + met.SetDesc("the time monitor took to check components' health") + met.AddLabel(metricBaseName) + met.AddBuckets([]float64{0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 10}...) + met.SetCollect(o.collectMetricLatency) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricLatency(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + if e := m.Observe([]string{name}, val.CollectLatency().Seconds()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + + return true + }) +} + +func (o *pool) createMetricsUptime() error { var ( - name = mon.Name() - met libmet.Metric - mnm string + prm libprm.Prometheus + met libmet.Metric + mnm string ) - mnm = o.getMetricName(name, metricLatency) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Histogram) - met.SetDesc("the time monitor took to check the health of '" + name + "' component") + if prm = o.getProm(); prm == nil { + return nil + } + + mnm = o.getMetricName(metricUptime) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the total seconds during the component are up") met.AddLabel(metricBaseName) - met.AddBuckets(prm.GetDuration()...) - met.SetCollect(func(ctx context.Context, m libmet.Metric) { - _ = m.Observe([]string{name}, mon.CollectLatency().Seconds()) + met.SetCollect(o.collectMetricUptime) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricUptime(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + if e := m.SetGaugeValue([]string{name}, val.CollectUpTime().Seconds()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + + return true }) +} - if e := prm.AddMetric(false, met); e != nil { - return e +func (o *pool) createMetricsDowntime() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil } - mnm = o.getMetricName(name, metricUptime) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Histogram) - met.SetDesc("the total time during which the '" + name + "' component is up") + mnm = o.getMetricName(metricDowntime) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the total time during the components are down") met.AddLabel(metricBaseName) - met.AddBuckets(prm.GetDuration()...) - met.SetCollect(func(ctx context.Context, m libmet.Metric) { - _ = m.Observe([]string{name}, mon.CollectUpTime().Seconds()) + met.SetCollect(o.collectMetricDowntime) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricDowntime(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + if e := m.SetGaugeValue([]string{name}, val.CollectDownTime().Seconds()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + + return true }) +} - if e := prm.AddMetric(false, met); e != nil { - return e +func (o *pool) createMetricsRiseTime() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil } - mnm = o.getMetricName(name, metricDowntime) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Histogram) - met.SetDesc("the total time during which the '" + name + "' component is down") + mnm = o.getMetricName(metricRiseTime) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the total time during the components are rising") met.AddLabel(metricBaseName) - met.AddBuckets(prm.GetDuration()...) - met.SetCollect(func(ctx context.Context, m libmet.Metric) { - _ = m.Observe([]string{name}, mon.CollectDownTime().Seconds()) + met.SetCollect(o.collectMetricRiseTime) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricRiseTime(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + if e := m.SetGaugeValue([]string{name}, val.CollectRiseTime().Seconds()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + + return true }) +} - if e := prm.AddMetric(false, met); e != nil { - return e +func (o *pool) createMetricsFallTime() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil } - mnm = o.getMetricName(name, metricRisetime) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Histogram) - met.SetDesc("the total time during which the '" + name + "' component is rising") + mnm = o.getMetricName(metricFallTime) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the total time during the components are falling") met.AddLabel(metricBaseName) - met.AddBuckets(prm.GetDuration()...) - met.SetCollect(func(ctx context.Context, m libmet.Metric) { - _ = m.Observe([]string{name}, mon.CollectRiseTime().Seconds()) + met.SetCollect(o.collectMetricFallTime) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricFallTime(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + if e := m.SetGaugeValue([]string{name}, val.CollectFallTime().Seconds()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + + return true }) +} - if e := prm.AddMetric(false, met); e != nil { - return e +func (o *pool) createMetricsStatus() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil } - mnm = o.getMetricName(name, metricFalltime) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Histogram) - met.SetDesc("the total time during which the '" + name + "' component is falling") + mnm = o.getMetricName(metricStatus) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the instant status of components") met.AddLabel(metricBaseName) - met.AddBuckets(prm.GetDuration()...) - met.SetCollect(func(ctx context.Context, m libmet.Metric) { - _ = m.Observe([]string{name}, mon.CollectFallTime().Seconds()) + met.SetCollect(o.collectMetricStatus) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricStatus(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + var ( + s, _, _ = val.CollectStatus() + ) + + if e := m.SetGaugeValue([]string{name}, s.Float()); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() + } + return true }) +} - if e := prm.AddMetric(false, met); e != nil { - return e +func (o *pool) createMetricsRising() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil } - mnm = o.getMetricName(name, metricStatus) - mon.RegisterMetricsAddName(mnm) - met = libmet.NewMetrics(mnm, prmtps.Counter) - met.SetDesc("the total time during which the '" + name + "' component is falling") - met.AddLabel(metricBaseName, "status", "rise", "fall") - met.SetCollect(func(ctx context.Context, m libmet.Metric) { + mnm = o.getMetricName(metricRise) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the status rising value of components") + met.AddLabel(metricBaseName) + met.SetCollect(o.collectMetricRising) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricRising(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { var ( - s, r, f = mon.CollectStatus() - rs, fs string + _, r, _ = val.CollectStatus() + s float64 ) if r { - rs = metricBoolTrue + s = 1 } else { - rs = metricBoolFalse + s = 0 + } + + if e := m.SetGaugeValue([]string{name}, s); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() } + return true + }) +} + +func (o *pool) createMetricsFalling() error { + var ( + prm libprm.Prometheus + met libmet.Metric + mnm string + ) + + if prm = o.getProm(); prm == nil { + return nil + } + + mnm = o.getMetricName(metricFall) + met = libmet.NewMetrics(mnm, prmtps.Gauge) + met.SetDesc("the status falling value of components") + met.AddLabel(metricBaseName) + met.SetCollect(o.collectMetricFalling) + + return prm.AddMetric(false, met) +} + +func (o *pool) collectMetricFalling(ctx context.Context, m libmet.Metric) { + var log = o.getLog() + + o.MonitorWalk(func(name string, val montps.Monitor) bool { + var ( + _, _, f = val.CollectStatus() + s float64 + ) + if f { - fs = metricBoolTrue + s = 1 } else { - fs = metricBoolFalse + s = 0 + } + + if e := m.SetGaugeValue([]string{name}, s); e != nil { + ent := log.Entry(liblog.ErrorLevel, "failed to collect metrics", nil) + ent.FieldAdd("monitor", name) + ent.FieldAdd("metric", val.Name()) + ent.ErrorAdd(true, e) + ent.Log() } - _ = m.Inc([]string{name, s, rs, fs}) + return true }) +} - if e := prm.AddMetric(false, met); e != nil { +func (o *pool) createMetrics() error { + if e := o.createMetricsLatency(); e != nil { return e } - mon.RegisterCollectMetrics(prm.CollectMetrics) + if e := o.createMetricsUptime(); e != nil { + return e + } - return nil -} + if e := o.createMetricsDowntime(); e != nil { + return e + } -func (o *pool) deleteMetrics(mon montps.Monitor) { - var prm libprm.Prometheus + if e := o.createMetricsRiseTime(); e != nil { + return e + } - if prm = o.getProm(); prm == nil { - return + if e := o.createMetricsFallTime(); e != nil { + return e } - var ( - name = mon.Name() - ) + if e := o.createMetricsStatus(); e != nil { + return e + } + + if e := o.createMetricsRising(); e != nil { + return e + } - prm.DelMetric(o.getMetricName(name, metricLatency)) - prm.DelMetric(o.getMetricName(name, metricUptime)) - prm.DelMetric(o.getMetricName(name, metricDowntime)) - prm.DelMetric(o.getMetricName(name, metricRisetime)) - prm.DelMetric(o.getMetricName(name, metricFalltime)) - prm.DelMetric(o.getMetricName(name, metricStatus)) + if e := o.createMetricsFalling(); e != nil { + return e + } + + return nil } diff --git a/monitor/pool/model.go b/monitor/pool/model.go index 9bb6f7aa..04ba866f 100644 --- a/monitor/pool/model.go +++ b/monitor/pool/model.go @@ -27,7 +27,11 @@ package pool import ( + "context" "sync" + "time" + + liblog "github.com/nabbar/golib/logger" libctx "github.com/nabbar/golib/context" libprm "github.com/nabbar/golib/prometheus" @@ -36,12 +40,58 @@ import ( type pool struct { m sync.RWMutex fp libprm.FuncGetPrometheus + fl liblog.FuncLog p libctx.Config[string] } +func (o *pool) getLog() liblog.Logger { + o.m.RLock() + defer o.m.RUnlock() + + if o.fl == nil { + return liblog.GetDefault() + } else if l := o.fl(); l != nil { + return l + } + + return liblog.GetDefault() +} + +func (o *pool) InitMetrics(prm libprm.FuncGetPrometheus, log liblog.FuncLog) error { + o.RegisterFctProm(prm) + o.RegisterFctLogger(log) + return o.createMetrics() +} + func (o *pool) RegisterFctProm(prm libprm.FuncGetPrometheus) { o.m.Lock() defer o.m.Unlock() o.fp = prm } + +func (o *pool) RegisterFctLogger(log liblog.FuncLog) { + o.m.Lock() + defer o.m.Unlock() + + o.fl = log +} + +func (o *pool) TriggerCollectMetrics(ctx context.Context, dur time.Duration) { + var tck *time.Ticker + + tck = time.NewTicker(dur) + defer tck.Stop() + + for { + select { + case <-tck.C: + if p := o.getProm(); p != nil { + p.CollectMetrics(ctx) + } + + case <-ctx.Done(): + return + } + } +} diff --git a/monitor/pool/pool.go b/monitor/pool/pool.go index ab6c27a3..da61cd55 100644 --- a/monitor/pool/pool.go +++ b/monitor/pool/pool.go @@ -39,10 +39,6 @@ func (o *pool) MonitorAdd(mon montps.Monitor) error { return fmt.Errorf("monitor name cannot be empty") } - if e := o.createMetrics(mon); e != nil { - return e - } - if o.IsRunning() && !mon.IsRunning() { if e := mon.Start(o.p.GetContext()); e != nil { return e @@ -81,12 +77,8 @@ func (o *pool) MonitorSet(mon montps.Monitor) error { func (o *pool) MonitorDel(name string) { if len(name) < 1 { return - } else if i, l := o.p.LoadAndDelete(name); !l { - return - } else if v, k := i.(montps.Monitor); !k { - return } else { - o.deleteMetrics(v) + o.p.Delete(name) } } diff --git a/monitor/server.go b/monitor/server.go index 330b5b91..a9f6239e 100644 --- a/monitor/server.go +++ b/monitor/server.go @@ -167,23 +167,17 @@ func (o *mon) check(ctx context.Context, cfg *runCfg) { var fct types.HealthCheck if fct = o.getFct(); fct == nil { - _ = o.setStatus(ErrorMissingHealthCheck.Error(nil), cfg) + l := o.getLastCheck() + l.setStatus(ErrorMissingHealthCheck.Error(nil), 0, cfg) + o.x.Store(keyLastRun, l) } else if cfg == nil { - _ = o.setStatus(ErrorValidatorError.Error(nil), cfg) + l := o.getLastCheck() + l.setStatus(ErrorValidatorError.Error(nil), 0, cfg) + o.x.Store(keyLastRun, l) } m := newMiddleware(cfg, fct) - m.Add(o.setLatency) m.Add(o.mdlStatus) - // add here other part to run - - m.Add(o.setUpTime) - m.Add(o.setDownTime) - m.Add(o.setRiseTime) - m.Add(o.setFallTime) - - // no add after this - m.Add(o.setLastCheck) m.Run(ctx) // store metrics to prometheus exporter diff --git a/monitor/status.go b/monitor/status.go index 36c36662..6328ee89 100644 --- a/monitor/status.go +++ b/monitor/status.go @@ -66,163 +66,44 @@ func (o *mon) InfoUpd(inf montps.Info) { } func (o *mon) Status() monsts.Status { - if i, l := o.x.Load(keyStatus); !l { - return monsts.KO - } else if v, k := i.(monsts.Status); !k { - return monsts.KO - } else { - return v - } + return o.getLastCheck().Status() } func (o *mon) Message() string { - if i, l := o.x.Load(keyMessage); !l { - return "" - } else if v, k := i.(error); !k { - return "" - } else if v == nil { - return "" - } else { - return v.Error() + if err := o.getLastCheck().Error(); err != nil { + return err.Error() } + + return "" } func (o *mon) IsRise() bool { - if sts := o.Status(); sts == monsts.OK { - return false - } else { - return o.riseGet() > 0 - } + return o.getLastCheck().IsRise() } func (o *mon) IsFall() bool { - if sts := o.Status(); sts == monsts.KO { - return false - } else { - return o.fallGet() > 0 - } + return o.getLastCheck().IsFall() } func (o *mon) Latency() time.Duration { - if i, l := o.x.Load(keyMetricLatency); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.getLastCheck().Latency() } func (o *mon) Uptime() time.Duration { - if i, l := o.x.Load(keyMetricUpTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } + return o.getLastCheck().UpTime() } func (o *mon) Downtime() time.Duration { - if i, l := o.x.Load(keyMetricDownTime); !l { - return 0 - } else if v, k := i.(time.Duration); !k { - return 0 - } else { - return v - } -} - -func (o *mon) riseInc() uint8 { - i := o.riseGet() + 1 - o.x.Store(keyRise, i) - return i -} - -func (o *mon) riseReset() { - o.x.Delete(keyRise) -} - -func (o *mon) riseGet() uint8 { - if i, l := o.x.Load(keyRise); !l { - return 0 - } else if v, k := i.(uint8); !k { - return 0 - } else { - return v - } -} - -func (o *mon) fallInc() uint8 { - i := o.fallGet() + 1 - o.x.Store(keyFall, i) - return i -} - -func (o *mon) fallReset() { - o.x.Delete(keyFall) -} - -func (o *mon) fallGet() uint8 { - if i, l := o.x.Load(keyFall); !l { - return 0 - } else if v, k := i.(uint8); !k { - return 0 - } else { - return v - } -} - -func (o *mon) setStatus(err error, cfg *runCfg) error { - if err != nil { - o.x.Store(keyMessage, err) - o.setStatusFall(cfg) - } else { - o.x.Delete(keyMessage) - o.setStatusRise(cfg) - } - - return err + return o.getLastCheck().DownTime() } func (o *mon) mdlStatus(m middleWare) error { - return o.setStatus(m.Next(), m.Config()) -} - -func (o *mon) setStatusFall(cfg *runCfg) { - sts := o.Status() - - if sts == monsts.KO || cfg == nil { - return - } + ts := time.Now() + err := m.Next() - o.riseReset() - i := o.fallInc() + lst := o.getLastCheck() + lst.setStatus(err, time.Since(ts), m.Config()) + o.setLastCheck(lst) - if i > cfg.fallCountKO { - o.x.Store(keyStatus, monsts.KO) - } else if i > cfg.fallCountWarn { - o.x.Store(keyStatus, monsts.Warn) - } else { - o.x.Store(keyStatus, monsts.OK) - } -} - -func (o *mon) setStatusRise(cfg *runCfg) { - sts := o.Status() - - if sts == monsts.OK || cfg == nil { - return - } - - o.fallReset() - i := o.riseInc() - - if i >= cfg.riseCountWarn { - o.x.Store(keyStatus, monsts.OK) - } else if i >= cfg.riseCountKO { - o.x.Store(keyStatus, monsts.Warn) - } else { - o.x.Store(keyStatus, monsts.KO) - } + return err } diff --git a/monitor/status/status.go b/monitor/status/status.go index 704ff6ae..a9dbe1ac 100644 --- a/monitor/status/status.go +++ b/monitor/status/status.go @@ -46,6 +46,10 @@ func (s Status) Int() int64 { return int64(s) } +func (s Status) Float() float64 { + return float64(s) +} + func (s Status) MarshalJSON() ([]byte, error) { b := make([]byte, 0, len(s.String())+2) b = append(b, '"') diff --git a/monitor/types/monitor.go b/monitor/types/monitor.go index 407b1f23..96aeac3d 100644 --- a/monitor/types/monitor.go +++ b/monitor/types/monitor.go @@ -83,7 +83,7 @@ type MonitorMetrics interface { CollectDownTime() time.Duration CollectRiseTime() time.Duration CollectFallTime() time.Duration - CollectStatus() (sts string, rise bool, fall bool) + CollectStatus() (sts monsts.Status, rise bool, fall bool) } type MonitorInfo interface {