Skip to content

Commit

Permalink
[Backport 7.59.x] expand conn_close_flushed map and add telemetry (#30591)
Browse files Browse the repository at this point in the history

Co-authored-by: Adam Karpowich <[email protected]>
  • Loading branch information
agent-platform-auto-pr[bot] and akarpz authored Oct 30, 2024
1 parent 67b092d commit 5df9e3b
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 1 deletion.
2 changes: 2 additions & 0 deletions pkg/network/ebpf/c/tracer.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ int BPF_BYPASSABLE_KPROBE(kprobe__tcp_done, struct sock *sk) {
__u64 timestamp = bpf_ktime_get_ns();
if (bpf_map_update_with_telemetry(conn_close_flushed, &t, &timestamp, BPF_NOEXIST, -EEXIST) == 0) {
cleanup_conn(ctx, &t, sk);
increment_telemetry_count(tcp_done_connection_flush);
flush_tcp_failure(ctx, &t, err);
} else {
bpf_map_delete_elem(&conn_close_flushed, &t);
Expand Down Expand Up @@ -287,6 +288,7 @@ int BPF_BYPASSABLE_KPROBE(kprobe__tcp_close, struct sock *sk) {
__u64 timestamp = bpf_ktime_get_ns();
if (bpf_map_update_with_telemetry(conn_close_flushed, &t, &timestamp, BPF_NOEXIST, -EEXIST) == 0) {
cleanup_conn(ctx, &t, sk);
increment_telemetry_count(tcp_close_connection_flush);
int err = 0;
bpf_probe_read_kernel_with_telemetry(&err, sizeof(err), (&sk->sk_err));
if (err == TCP_CONN_FAILED_RESET || err == TCP_CONN_FAILED_TIMEOUT || err == TCP_CONN_FAILED_REFUSED) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/network/ebpf/c/tracer/maps.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ BPF_HASH_MAP(tcp_retransmits, conn_tuple_t, __u32, 0)
BPF_HASH_MAP(tcp_ongoing_connect_pid, skp_conn_tuple_t, pid_ts_t, 0)

/* Holds, per connection tuple, the timestamp at which a closed connection was flushed; an existing entry indicates the connection has already been flushed */
BPF_HASH_MAP(conn_close_flushed, conn_tuple_t, __u64, 8192)
BPF_HASH_MAP(conn_close_flushed, conn_tuple_t, __u64, 16384)

/* Will hold the tcp/udp close events
* The keys are the cpu number and the values a perf file descriptor for a perf event
Expand Down
8 changes: 8 additions & 0 deletions pkg/network/ebpf/c/tracer/telemetry.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ enum telemetry_counter {
tcp_done_failed_tuple,
tcp_finish_connect_failed_tuple,
tcp_close_target_failures,
tcp_done_connection_flush,
tcp_close_connection_flush
};

static __always_inline void increment_telemetry_count(enum telemetry_counter counter_name) {
Expand Down Expand Up @@ -82,6 +84,12 @@ static __always_inline void increment_telemetry_count(enum telemetry_counter cou
case tcp_close_target_failures:
__sync_fetch_and_add(&val->tcp_close_target_failures, 1);
break;
case tcp_done_connection_flush:
__sync_fetch_and_add(&val->tcp_done_connection_flush, 1);
break;
case tcp_close_connection_flush:
__sync_fetch_and_add(&val->tcp_close_connection_flush, 1);
break;
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/network/ebpf/c/tracer/tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ typedef struct {
__u64 tcp_done_failed_tuple;
__u64 tcp_finish_connect_failed_tuple;
__u64 tcp_close_target_failures;
__u64 tcp_done_connection_flush;
__u64 tcp_close_connection_flush;
} telemetry_t;

typedef struct {
Expand Down
2 changes: 2 additions & 0 deletions pkg/network/ebpf/kprobe_types_linux.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions pkg/network/tracer/connection/ebpf_tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ var EbpfTracerTelemetry = struct {
tcpDoneFailedTuple *prometheus.Desc
tcpFinishConnectFailedTuple *prometheus.Desc
tcpCloseTargetFailures *prometheus.Desc
tcpDoneConnectionFlush *prometheus.Desc
tcpCloseConnectionFlush *prometheus.Desc
ongoingConnectPidCleaned telemetry.Counter
PidCollisions *telemetry.StatCounterWrapper
iterationDups telemetry.Counter
Expand All @@ -93,6 +95,8 @@ var EbpfTracerTelemetry = struct {
lastTcpDoneFailedTuple *atomic.Int64
lastTcpFinishConnectFailedTuple *atomic.Int64
lastTcpCloseTargetFailures *atomic.Int64
lastTcpDoneConnectionFlush *atomic.Int64
lastTcpCloseConnectionFlush *atomic.Int64
}{
telemetry.NewGauge(connTracerModuleName, "connections", []string{"ip_proto", "family"}, "Gauge measuring the number of active connections in the EBPF map"),
prometheus.NewDesc(connTracerModuleName+"__tcp_failed_connects", "Counter measuring the number of failed TCP connections in the EBPF map", nil, nil),
Expand All @@ -110,6 +114,8 @@ var EbpfTracerTelemetry = struct {
prometheus.NewDesc(connTracerModuleName+"__tcp_done_failed_tuple", "Counter measuring the number of failed TCP connections due to tuple collisions", nil, nil),
prometheus.NewDesc(connTracerModuleName+"__tcp_finish_connect_failed_tuple", "Counter measuring the number of failed TCP connections due to tuple collisions", nil, nil),
prometheus.NewDesc(connTracerModuleName+"__tcp_close_target_failures", "Counter measuring the number of failed TCP connections in tcp_close", nil, nil),
prometheus.NewDesc(connTracerModuleName+"__tcp_done_connection_flush", "Counter measuring the number of connection flushes performed in tcp_done", nil, nil),
prometheus.NewDesc(connTracerModuleName+"__tcp_close_connection_flush", "Counter measuring the number of connection flushes performed in tcp_close", nil, nil),
telemetry.NewCounter(connTracerModuleName, "ongoing_connect_pid_cleaned", []string{}, "Counter measuring the number of tcp_ongoing_connect_pid entries cleaned in userspace"),
telemetry.NewStatCounterWrapper(connTracerModuleName, "pid_collisions", []string{}, "Counter measuring number of process collisions"),
telemetry.NewCounter(connTracerModuleName, "iteration_dups", []string{}, "Counter measuring the number of connections iterated more than once"),
Expand All @@ -129,6 +135,8 @@ var EbpfTracerTelemetry = struct {
atomic.NewInt64(0),
atomic.NewInt64(0),
atomic.NewInt64(0),
atomic.NewInt64(0),
atomic.NewInt64(0),
}

type ebpfTracer struct {
Expand Down Expand Up @@ -533,6 +541,8 @@ func (t *ebpfTracer) Describe(ch chan<- *prometheus.Desc) {
ch <- EbpfTracerTelemetry.tcpDoneFailedTuple
ch <- EbpfTracerTelemetry.tcpFinishConnectFailedTuple
ch <- EbpfTracerTelemetry.tcpCloseTargetFailures
ch <- EbpfTracerTelemetry.tcpDoneConnectionFlush
ch <- EbpfTracerTelemetry.tcpCloseConnectionFlush
}

// Collect returns the current state of all metrics of the collector
Expand Down Expand Up @@ -600,6 +610,14 @@ func (t *ebpfTracer) Collect(ch chan<- prometheus.Metric) {
delta = int64(ebpfTelemetry.Tcp_close_target_failures) - EbpfTracerTelemetry.lastTcpCloseTargetFailures.Load()
EbpfTracerTelemetry.lastTcpCloseTargetFailures.Store(int64(ebpfTelemetry.Tcp_close_target_failures))
ch <- prometheus.MustNewConstMetric(EbpfTracerTelemetry.tcpCloseTargetFailures, prometheus.CounterValue, float64(delta))

delta = int64(ebpfTelemetry.Tcp_done_connection_flush) - EbpfTracerTelemetry.lastTcpDoneConnectionFlush.Load()
EbpfTracerTelemetry.lastTcpDoneConnectionFlush.Store(int64(ebpfTelemetry.Tcp_done_connection_flush))
ch <- prometheus.MustNewConstMetric(EbpfTracerTelemetry.tcpDoneConnectionFlush, prometheus.CounterValue, float64(delta))

delta = int64(ebpfTelemetry.Tcp_close_connection_flush) - EbpfTracerTelemetry.lastTcpCloseConnectionFlush.Load()
EbpfTracerTelemetry.lastTcpCloseConnectionFlush.Store(int64(ebpfTelemetry.Tcp_close_connection_flush))
ch <- prometheus.MustNewConstMetric(EbpfTracerTelemetry.tcpCloseConnectionFlush, prometheus.CounterValue, float64(delta))
}

// DumpMaps (for debugging purpose) returns all maps content by default or selected maps from maps parameter.
Expand Down

0 comments on commit 5df9e3b

Please sign in to comment.