From 55d3e36aa5c84ea50b4ff86fbf20663834805548 Mon Sep 17 00:00:00 2001 From: Carson Anderson Date: Thu, 17 Mar 2022 15:18:05 -0600 Subject: [PATCH 1/4] add ssh connect attempts metric --- docs/pages/setup/reference/metrics.mdx | 1 + lib/srv/regular/proxy.go | 11 ++++++++++- metrics.go | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/pages/setup/reference/metrics.mdx b/docs/pages/setup/reference/metrics.mdx index ae574253ff76f..c22d325092fb1 100644 --- a/docs/pages/setup/reference/metrics.mdx +++ b/docs/pages/setup/reference/metrics.mdx @@ -112,6 +112,7 @@ Now you can see the monitoring information by visiting several endpoints: | `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. | | `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. | | `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. | +| `teleport_connect_to_node_attempts_total` | gauge | Teleport Proxy | Number of `tsh login` or `tsh ssh` login attempts. | | `teleport_connected_resources` | gauge | Teleport Auth | Tracks the number and type of resources connected via keepalives. | | `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport servers (a server consists of one or more Teleport services) that have connected to the Teleport cluster, including the Teleport version. After disconnecting, a Teleport server has a TTL of 10 minutes, so this value will include servers that have recently disconnected but have not reached their TTL. | | `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. | diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index cf11e7eed69e1..1205a5637bc11 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -59,7 +59,15 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node }, ) - prometheusCollectors = []prometheus.Collector{proxiedSessions, failedConnectingToNode} + connectingToNode = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: teleport.MetricNamespace, + Name: teleport.MetricConnectToNodeAttempts, + Help: "Number of ssh connect attempts", + }, + ) + + prometheusCollectors = []prometheus.Collector{proxiedSessions, failedConnectingToNode, connectingToNode} ) // proxySubsys implements an SSH subsystem for proxying listening sockets from @@ -405,6 +413,7 @@ func (t *proxySubsys) proxyToHost( AddrNetwork: "tcp", Addr: serverAddr, } + connectingToNode.Inc() conn, err := site.Dial(reversetunnel.DialParams{ From: remoteAddr, To: toAddr, diff --git a/metrics.go b/metrics.go index 49ed320c614a5..71e5a8dda0602 100644 --- a/metrics.go +++ b/metrics.go @@ -49,6 +49,9 @@ const ( // MetricFailedLoginAttempts counts failed login attempts MetricFailedLoginAttempts = "failed_login_attempts_total" + // MetricConnectToNodeAttempts counts ssh attempts + MetricConnectToNodeAttempts = "connect_to_node_attempts_total" + // MetricFailedConnectToNodeAttempts counts failed ssh attempts MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total" From fb6f622338842c60b17942829b6677ad90f7aab5 Mon Sep 17 00:00:00 2001 From: Carson Anderson Date: Thu, 17 Mar 2022 15:25:46 -0600 Subject: [PATCH 2/4] fix help message wording --- docs/pages/setup/reference/metrics.mdx | 2 +- lib/srv/regular/proxy.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/pages/setup/reference/metrics.mdx b/docs/pages/setup/reference/metrics.mdx index c22d325092fb1..edc02b97e238e 100644 --- a/docs/pages/setup/reference/metrics.mdx +++ b/docs/pages/setup/reference/metrics.mdx @@ -112,7 +112,7 @@ Now you can see the monitoring information by visiting several endpoints: | `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. | | `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. | | `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. | -| `teleport_connect_to_node_attempts_total` | gauge | Teleport Proxy | Number of `tsh login` or `tsh ssh` login attempts. | +| `teleport_connect_to_node_attempts_total` | gauge | Teleport Proxy | Number of ssh connection attempts to a node. | | `teleport_connected_resources` | gauge | Teleport Auth | Tracks the number and type of resources connected via keepalives. | | `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport servers (a server consists of one or more Teleport services) that have connected to the Teleport cluster, including the Teleport version. After disconnecting, a Teleport server has a TTL of 10 minutes, so this value will include servers that have recently disconnected but have not reached their TTL. | | `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. | diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index 1205a5637bc11..a4c5e2b05111c 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -63,7 +63,7 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node prometheus.CounterOpts{ Namespace: teleport.MetricNamespace, Name: teleport.MetricConnectToNodeAttempts, - Help: "Number of ssh connect attempts", + Help: "Number of ssh connection attempts to a node.", }, ) From c1e38d898996d80d3774358a2190c45540fdd4ab Mon Sep 17 00:00:00 2001 From: Carson Anderson Date: Mon, 21 Mar 2022 08:48:16 -0600 Subject: [PATCH 3/4] updated HELP and docs to clarify usage of connect metric --- docs/pages/setup/reference/metrics.mdx | 4 ++-- lib/srv/regular/proxy.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/pages/setup/reference/metrics.mdx b/docs/pages/setup/reference/metrics.mdx index edc02b97e238e..07db8ede7ae8d 100644 --- a/docs/pages/setup/reference/metrics.mdx +++ b/docs/pages/setup/reference/metrics.mdx @@ -54,7 +54,7 @@ Now you can see the monitoring information by visiting several endpoints: | `etcd_backend_tx_seconds` | histogram | etcd | Latency for etcd transaction operations. | | `etcd_backend_write_requests` | counter | etcd | Number of write requests to the database. | | `etcd_backend_write_seconds` | histogram | etcd | Latency for etcd write operations. | -| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node | +| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of failed SSH connection attempts to a node. Used with `teleport_connect_to_node_attempts_total` to get failure rate. | | `failed_login_attempts_total` | counter | Teleport Proxy | Number of failed `tsh login` or `tsh ssh` logins. | | `firestore_events_backend_batch_read_requests` | counter | GCP Cloud Firestore | Number of batch read requests to Cloud Firestore events. | | `firestore_events_backend_batch_read_seconds` | histogram | GCP Cloud Firestore | Latency for Cloud Firestore events batch read operations. | @@ -112,7 +112,7 @@ Now you can see the monitoring information by visiting several endpoints: | `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. | | `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. | | `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. | -| `teleport_connect_to_node_attempts_total` | gauge | Teleport Proxy | Number of ssh connection attempts to a node. | +| `teleport_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of SSH connection attempts to a node. Used with `failed_connect_to_node_attempts_total` to get failure rate. | | `teleport_connected_resources` | gauge | Teleport Auth | Tracks the number and type of resources connected via keepalives. | | `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport servers (a server consists of one or more Teleport services) that have connected to the Teleport cluster, including the Teleport version. After disconnecting, a Teleport server has a TTL of 10 minutes, so this value will include servers that have recently disconnected but have not reached their TTL. | | `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. | diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index a4c5e2b05111c..bcbbe2bddad6a 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -55,7 +55,7 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node failedConnectingToNode = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricFailedConnectToNodeAttempts, - Help: "Number of failed attempts to connect to a node", + Help: "Number of failed SSH connection attempts to a node. Used with `teleport_connect_to_node_attempts_total` to get failure rate.", }, ) @@ -63,7 +63,7 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node prometheus.CounterOpts{ Namespace: teleport.MetricNamespace, Name: teleport.MetricConnectToNodeAttempts, - Help: "Number of ssh connection attempts to a node.", + Help: "Number of SSH connection attempts to a node. Used with `failed_connect_to_node_attempts_total` to get failure rate.", }, ) From b343c6c5fc442211258cc4f9496328541ee73483 Mon Sep 17 00:00:00 2001 From: Carson Anderson Date: Tue, 22 Mar 2022 16:05:44 -0600 Subject: [PATCH 4/4] Apply help and docs suggestions Co-authored-by: Paul Gottschling --- docs/pages/setup/reference/metrics.mdx | 4 ++-- lib/srv/regular/proxy.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/pages/setup/reference/metrics.mdx b/docs/pages/setup/reference/metrics.mdx index 07db8ede7ae8d..c468261e497d4 100644 --- a/docs/pages/setup/reference/metrics.mdx +++ b/docs/pages/setup/reference/metrics.mdx @@ -54,7 +54,7 @@ Now you can see the monitoring information by visiting several endpoints: | `etcd_backend_tx_seconds` | histogram | etcd | Latency for etcd transaction operations. | | `etcd_backend_write_requests` | counter | etcd | Number of write requests to the database. | | `etcd_backend_write_seconds` | histogram | etcd | Latency for etcd write operations. | -| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of failed SSH connection attempts to a node. Used with `teleport_connect_to_node_attempts_total` to get failure rate. | +| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of failed SSH connection attempts to a node. Use with `teleport_connect_to_node_attempts_total` to get the failure rate. | | `failed_login_attempts_total` | counter | Teleport Proxy | Number of failed `tsh login` or `tsh ssh` logins. | | `firestore_events_backend_batch_read_requests` | counter | GCP Cloud Firestore | Number of batch read requests to Cloud Firestore events. | | `firestore_events_backend_batch_read_seconds` | histogram | GCP Cloud Firestore | Latency for Cloud Firestore events batch read operations. | @@ -112,7 +112,7 @@ Now you can see the monitoring information by visiting several endpoints: | `teleport_build_info` | gauge | Teleport | Provides build information of Teleport including gitref (git describe --long --tags), Go version, and Teleport version. The value of this gauge will always be 1. | | `teleport_cache_events` | counter | Teleport | Number of events received by a Teleport service cache. Teleport's Auth Service, Proxy Service, and other services cache incoming events related to their service. | | `teleport_cache_stale_events` | counter | Teleport | Number of stale events received by a Teleport service cache. A high percentage of stale events can indicate a degraded backend. | -| `teleport_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of SSH connection attempts to a node. Used with `failed_connect_to_node_attempts_total` to get failure rate. | +| `teleport_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of SSH connection attempts to a node. Use with `failed_connect_to_node_attempts_total` to get the failure rate. | | `teleport_connected_resources` | gauge | Teleport Auth | Tracks the number and type of resources connected via keepalives. | | `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport servers (a server consists of one or more Teleport services) that have connected to the Teleport cluster, including the Teleport version. After disconnecting, a Teleport server has a TTL of 10 minutes, so this value will include servers that have recently disconnected but have not reached their TTL. | | `teleport_reverse_tunnels_connected` | gauge | Teleport Proxy | Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances. | diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index bcbbe2bddad6a..1b854fe7420fa 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -55,7 +55,7 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node failedConnectingToNode = prometheus.NewCounter( prometheus.CounterOpts{ Name: teleport.MetricFailedConnectToNodeAttempts, - Help: "Number of failed SSH connection attempts to a node. Used with `teleport_connect_to_node_attempts_total` to get failure rate.", + Help: "Number of failed SSH connection attempts to a node. Use with `teleport_connect_to_node_attempts_total` to get the failure rate.", }, ) @@ -63,7 +63,7 @@ var ( // failedConnectingToNode counts failed attempts to connect to a node prometheus.CounterOpts{ Namespace: teleport.MetricNamespace, Name: teleport.MetricConnectToNodeAttempts, - Help: "Number of SSH connection attempts to a node. Used with `failed_connect_to_node_attempts_total` to get failure rate.", + Help: "Number of SSH connection attempts to a node. Use with `failed_connect_to_node_attempts_total` to get the failure rate.", }, )