Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Prometheus metrics cache events and stale events #9826

Merged
merged 9 commits into from
Feb 11, 2022
2 changes: 2 additions & 0 deletions docs/pages/setup/reference/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ Now you can see the monitoring information by visiting several endpoints:
| `rx` | counter | Teleport | Number of bytes received. |
| `server_interactive_sessions_total` | gauge | Teleport | Number of active sessions. |
| `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. |
| `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. |
| `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. |
| `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport servers (a server consists of one or more Teleport services) that have connected to the Teleport cluster, including the Teleport version. After disconnecting, a Teleport server has a TTL of 10 minutes, so this value will include servers that have recently disconnected but have not reached their TTL. |
| `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. |
| `trusted_clusters` | gauge | Teleport | Number of tunnels per state. |
Expand Down
28 changes: 28 additions & 0 deletions lib/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,32 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"go.uber.org/atomic"
)

// newCacheCounterVec builds a counter vector in the Teleport metric namespace
// that is partitioned by the cache component label.
func newCacheCounterVec(name, help string) *prometheus.CounterVec {
	opts := prometheus.CounterOpts{
		Namespace: teleport.MetricNamespace,
		Name:      name,
		Help:      help,
	}
	return prometheus.NewCounterVec(opts, []string{teleport.TagCacheComponent})
}

// cacheEventsReceived counts events received by a cache, labelled by the
// cache component.
var cacheEventsReceived = newCacheCounterVec(
	teleport.MetricCacheEventsReceived,
	"Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service.",
)

// cacheStaleEventsReceived counts the received events that were classified as
// stale, labelled by the cache component.
var cacheStaleEventsReceived = newCacheCounterVec(
	teleport.MetricStaleCacheEventsReceived,
	"Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend.",
)

// cacheCollectors is the full set of cache metrics handed to the Prometheus
// registration helper when a cache is created.
var cacheCollectors = []prometheus.Collector{
	cacheEventsReceived,
	cacheStaleEventsReceived,
}

// tombstoneKey returns the backend key built from the "cache" prefix, the
// running Teleport version, and the "tombstone"/"ok" suffix. Because the
// version is part of the key, the key differs between Teleport versions.
func tombstoneKey() []byte {
	key := backend.Key("cache", teleport.Version, "tombstone", "ok")
	return key
}
Expand Down Expand Up @@ -241,6 +263,7 @@ func ForApps(cfg Config) Config {

// ForDatabases sets up watch configuration for database proxy servers.
func ForDatabases(cfg Config) Config {
cfg.target = "db"
cfg.Watches = []types.WatchKind{
{Kind: types.KindCertAuthority, LoadSecrets: false},
{Kind: types.KindClusterName},
Expand Down Expand Up @@ -609,6 +632,9 @@ const (

// New creates a new instance of Cache
func New(config Config) (*Cache, error) {
if err := utils.RegisterPrometheusCollectors(cacheCollectors...); err != nil {
return nil, trace.Wrap(err)
}
if err := config.CheckAndSetDefaults(); err != nil {
return nil, trace.Wrap(err)
}
Expand Down Expand Up @@ -956,8 +982,10 @@ func (c *Cache) fetchAndWatch(ctx context.Context, retry utils.Retry, timer *tim
// than pruning the resources that we think *might* have been removed from the real backend.
// TODO(fspmarshall): ^^^
//
cacheEventsReceived.WithLabelValues(c.target).Inc()
if event.Type == types.OpPut && !event.Resource.Expiry().IsZero() {
if now := c.Clock.Now(); now.After(event.Resource.Expiry()) {
cacheStaleEventsReceived.WithLabelValues(c.target).Inc()
staleEventCount++
if now.After(lastStalenessWarning.Add(time.Minute)) {
kind := event.Resource.GetKind()
Expand Down
9 changes: 9 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,12 @@ const (
// MetricBuildInfo tracks build information
MetricBuildInfo = "build_info"

// MetricCacheEventsReceived tracks the total number of events received by a cache
MetricCacheEventsReceived = "cache_events"

// MetricStaleCacheEventsReceived tracks the number of stale events received by a cache
MetricStaleCacheEventsReceived = "cache_stale_events"

// MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their TTL
MetricRegisteredServers = "registered_servers"

Expand Down Expand Up @@ -213,6 +219,9 @@ const (
// TagGoVersion is a prometheus label for version of Go used to build Teleport
TagGoVersion = "goversion"

// TagCacheComponent is a prometheus label for the cache component
TagCacheComponent = "cache_component"

// TagType is a prometheus label for type of resource or tunnel connected
TagType = "type"
)