Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Prometheus metrics for cache events and stale cache events #9826

Merged
merged 9 commits into from
Feb 11, 2022
2 changes: 2 additions & 0 deletions docs/pages/setup/reference/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ Now you can see the monitoring information by visiting several endpoints:
| `rx` | counter | Teleport | Number of bytes received. |
| `server_interactive_sessions_total` | gauge | Teleport | Number of active sessions. |
| `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. |
| `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. |
| `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. |
| `trusted_clusters` | gauge | Teleport | Number of tunnels per state. |
| `tx` | counter | Teleport | Number of bytes transmitted. |
| `user_login_total` | counter | Teleport Auth | Number of user logins. |
Expand Down
28 changes: 28 additions & 0 deletions lib/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,32 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"go.uber.org/atomic"
)

// Prometheus collectors describing the event traffic seen by caches. Both
// counters are partitioned by the teleport.TagCacheComponent label.
var (
	// cacheEventsReceived counts every event received by a cache.
	cacheEventsReceived = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Help:      "Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service.",
			Name:      teleport.MetricCacheEventsReceived,
			Namespace: teleport.MetricNamespace,
		},
		[]string{teleport.TagCacheComponent},
	)

	// cacheStaleEventsReceived counts the subset of received events that
	// were already stale when they reached the cache.
	cacheStaleEventsReceived = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Help:      "Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend.",
			Name:      teleport.MetricStaleCacheEventsReceived,
			Namespace: teleport.MetricNamespace,
		},
		[]string{teleport.TagCacheComponent},
	)

	// cacheCollectors lists every collector declared above so that they can
	// be registered together in a single call.
	cacheCollectors = []prometheus.Collector{
		cacheEventsReceived,
		cacheStaleEventsReceived,
	}
)

// tombstoneKey returns the backend key assembled from the parts "cache",
// the running teleport.Version, "tombstone" and "ok". Because the version is
// one of the parts, the key differs between Teleport versions.
func tombstoneKey() []byte {
	key := backend.Key("cache", teleport.Version, "tombstone", "ok")
	return key
}
Expand Down Expand Up @@ -238,6 +260,7 @@ func ForApps(cfg Config) Config {

// ForDatabases sets up watch configuration for database proxy servers.
func ForDatabases(cfg Config) Config {
cfg.target = "db"
cfg.Watches = []types.WatchKind{
{Kind: types.KindCertAuthority, LoadSecrets: false},
{Kind: types.KindClusterName},
Expand Down Expand Up @@ -593,6 +616,9 @@ const (

// New creates a new instance of Cache
func New(config Config) (*Cache, error) {
if err := utils.RegisterPrometheusCollectors(cacheCollectors...); err != nil {
return nil, trace.Wrap(err)
}
if err := config.CheckAndSetDefaults(); err != nil {
return nil, trace.Wrap(err)
}
Expand Down Expand Up @@ -920,8 +946,10 @@ func (c *Cache) fetchAndWatch(ctx context.Context, retry utils.Retry, timer *tim
// than pruning the resources that we think *might* have been removed from the real backend.
// TODO(fspmarshall): ^^^
//
cacheEventsReceived.WithLabelValues(c.target).Inc()
if event.Type == types.OpPut && !event.Resource.Expiry().IsZero() {
if now := c.Clock.Now(); now.After(event.Resource.Expiry()) {
cacheStaleEventsReceived.WithLabelValues(c.target).Inc()
staleEventCount++
if now.After(lastStalenessWarning.Add(time.Minute)) {
kind := event.Resource.GetKind()
Expand Down
9 changes: 9 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,12 @@ const (
// MetricBuildInfo tracks build information
MetricBuildInfo = "build_info"

// MetricCacheEventsReceived tracks the total number of events received by a cache
MetricCacheEventsReceived = "cache_events"

// MetricStaleCacheEventsReceived tracks the number of stale events received by a cache
MetricStaleCacheEventsReceived = "cache_stale_events"

// TagRange is a tag specifying backend requests
TagRange = "range"

Expand All @@ -206,4 +212,7 @@ const (

// TagGoVersion is a prometheus label for version of Go used to build Teleport
TagGoVersion = "goversion"

// TagCacheComponent is a prometheus label naming the component a cache serves (the cache's target, e.g. "db")
TagCacheComponent = "cache_component"
)