Skip to content

Commit

Permalink
Propagate context through TCP packets (#1161)
Browse files Browse the repository at this point in the history
  • Loading branch information
grcevski authored Sep 12, 2024
1 parent c6655f6 commit 6a94e5e
Show file tree
Hide file tree
Showing 55 changed files with 281 additions and 36 deletions.
5 changes: 3 additions & 2 deletions bpf/http_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,13 @@ const u8 ip4ip6_prefix[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff};

#ifdef BPF_DEBUG
static __always_inline void dbg_print_http_connection_info(connection_info_t *info) {
bpf_dbg_printk("[http] s_h = %llx, s_l = %llx, d_h = %llx, d_l = %llx, s_port=%d, d_port=%d",
bpf_dbg_printk("[conn] s_h = %llx, s_l = %llx, s_port=%d",
*(u64 *)(&info->s_addr),
*(u64 *)(&info->s_addr[8]),
info->s_port);
bpf_dbg_printk("[conn] d_h = %llx, d_l = %llx, d_port=%d",
*(u64 *)(&info->d_addr),
*(u64 *)(&info->d_addr[8]),
info->s_port,
info->d_port);
}
#else
Expand Down
92 changes: 77 additions & 15 deletions bpf/k_tracer.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

char __license[] SEC("license") = "Dual MIT/GPL";

#define TC_SYN_PACKET_ID 0xdeadf00d

// Temporary tracking of accept arguments
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
Expand Down Expand Up @@ -186,6 +188,39 @@ int BPF_KPROBE(kprobe_tcp_connect, struct sock *sk) {

bpf_dbg_printk("=== tcp connect %llx ===", id);

tp_info_pid_t *tp_p = tp_buf();

// Connect runs before the SYN packet is sent.
// We use this opportunity to setup a trace context information for the connection.
// We'll later query the trace information in tc_egress, and serialize it on the TCP packet.
// Why would we do this here instead of on the tc_egress itself? We could move this on the tc_egress,
// but we would be modifying all packets, not just for processes which are instrumented,
// since we can't reliably tell the process PID in TC or socket filters.
if (tp_p) {
tp_p->tp.ts = bpf_ktime_get_ns();
tp_p->tp.flags = 1;
tp_p->valid = 1;
tp_p->pid = TC_SYN_PACKET_ID; // set an ID up here in case someone else is doing what we are doing
urand_bytes(tp_p->tp.span_id, SPAN_ID_SIZE_BYTES);
tp_info_pid_t *server_tp = find_parent_trace();
if (server_tp && valid_trace(server_tp->tp.trace_id)) {
__builtin_memcpy(tp_p->tp.trace_id, server_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, server_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
} else {
urand_bytes(tp_p->tp.trace_id, TRACE_ID_SIZE_BYTES);
__builtin_memset(tp_p->tp.parent_id, 0, sizeof(tp_p->tp.span_id));
}

connection_info_t conn = {};
parse_sock_info(sk, &conn);
sort_connection_info(&conn);

bpf_dbg_printk("Setting up tp info");
dbg_print_http_connection_info(&conn);

bpf_map_update_elem(&outgoing_trace_map, &conn, tp_p, BPF_ANY);
}

u64 addr = (u64)sk;

sock_args_t args = {};
Expand Down Expand Up @@ -232,7 +267,7 @@ int BPF_KRETPROBE(kretprobe_sys_connect, int fd)
sort_connection_info(&info.p_conn.conn);
info.p_conn.pid = pid_from_pid_tgid(id);
info.orig_dport = orig_dport;

bpf_map_update_elem(&pid_tid_to_conn, &id, &info, BPF_ANY); // to support SSL
}

Expand Down Expand Up @@ -678,17 +713,35 @@ int app_ingress(struct __sk_buff *skb) {
return 0;
}

unsigned char buf[12];
// handle SYN only, ignore SYN+ACK packets
if (!tcp_syn(&tcp) || tcp_ack(&tcp)) {
return 0;
}

if (tcp_syn(&tcp) && !tcp_ack(&tcp)) {
bpf_skb_load_bytes(skb, tcp.hdr_len, &buf, 4);
if (skb->len - tcp.hdr_len < sizeof(tp_info_pid_t)) {
bpf_printk("SYN packet without tp info");
return 0;
}

tp_info_pid_t tp = {0};

s32 len = skb->len-sizeof(u32);
bpf_printk("SYN packed len = %d, offset = %d, hdr_len %d", skb->len, len, tcp.hdr_len);
bpf_skb_load_bytes(skb, tcp.hdr_len, &tp, sizeof(tp_info_pid_t));

bpf_printk("***Data: %x%x", buf[3], buf[2]);
bpf_printk("***Data: %x%x", buf[1], buf[0]);
if (tp.pid != TC_SYN_PACKET_ID) {
bpf_printk("SYN packet without the custom ID inside pid, ignoring...");
return 0;
}

s32 len = skb->len-sizeof(u32);
bpf_printk("Received SYN packed len = %d, offset = %d, hdr_len %d", skb->len, len, tcp.hdr_len);

unsigned char tp_buf[TP_MAX_VAL_LENGTH];
make_tp_string(tp_buf, &tp.tp);
bpf_printk("tp: %s", tp_buf);

// Once we receive a traceID over the wire (TCP packet) we store it for later to be used by the trace code.
bpf_map_update_elem(&incoming_trace_map, &conn, &tp, BPF_ANY);

return 0;
}

Expand All @@ -703,17 +756,26 @@ int app_egress(struct __sk_buff *skb) {
return 0;
}

// handle SYN only, ignore SYN+ACK packets
if (!tcp_syn(&tcp) || tcp_ack(&tcp)) {
return 0;
}

sort_connection_info(&conn);

if (tcp_syn(&tcp) && !tcp_ack(&tcp)) {
tp_info_pid_t *tp = bpf_map_lookup_elem(&outgoing_trace_map, &conn);

if (tp) {
bpf_printk("SYN packed len = %d", skb->len);

u32 val=0xdeadf00d;
unsigned char tp_buf[TP_MAX_VAL_LENGTH];
make_tp_string(tp_buf, &tp->tp);
bpf_printk("tp: %s", tp_buf);

uint16_t pkt_end = skb->data_end - skb->data;
bpf_printk("Changing tail and setting data on syn, end=%d", pkt_end);
bpf_skb_change_tail(skb, pkt_end + sizeof(val), 0);
bpf_skb_store_bytes(skb, pkt_end, &val, sizeof(val), 0);
bpf_skb_change_tail(skb, pkt_end + sizeof(tp_info_pid_t), 0);
bpf_skb_store_bytes(skb, pkt_end, tp, sizeof(tp_info_pid_t), 0);

u32 offset_ip_tot_len = 0;
u32 offset_ip_checksum = 0;
Expand All @@ -724,10 +786,10 @@ int app_egress(struct __sk_buff *skb) {
offset_ip_tot_len = ETH_HLEN + offsetof(struct ipv6hdr, payload_len);
}

u16 new_tot_len = bpf_htons(bpf_ntohs(tcp.tot_len) + sizeof(val));
u16 new_tot_len = bpf_htons(bpf_ntohs(tcp.tot_len) + sizeof(tp_info_pid_t));

bpf_printk("tot_len = %d, tot_len_alt = %d, new_tot_len = %d", tcp.tot_len, bpf_ntohs(tcp.tot_len), new_tot_len);
bpf_printk("new_tot_len_alt = %d, h_proto = %d, skb->len = %d", bpf_ntohs(new_tot_len), tcp.h_proto, skb->len);
bpf_printk("tot_len = %u, new_tot_len = %u", bpf_ntohs(tcp.tot_len), bpf_ntohs(new_tot_len));
bpf_printk("h_proto = %u, skb->len = %u", tcp.h_proto, skb->len);

if (offset_ip_checksum) {
bpf_l3_csum_replace(skb, offset_ip_checksum, tcp.tot_len, new_tot_len, sizeof(u16));
Expand Down
2 changes: 1 addition & 1 deletion bpf/protocol_http.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ int protocol_http(void *ctx) {
if (meta->type == EVENT_HTTP_CLIENT && !valid_span(tp_p->tp.parent_id)) {
bpf_dbg_printk("Looking for trace id of a client span");
tp_info_pid_t *server_tp = find_parent_trace();
if (server_tp && server_tp->valid) {
if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
bpf_dbg_printk("Found existing server span for id=%llx", bpf_get_current_pid_tgid());
__builtin_memcpy(info->tp.trace_id, server_tp->tp.trace_id, sizeof(info->tp.trace_id));
__builtin_memcpy(info->tp.parent_id, server_tp->tp.span_id, sizeof(info->tp.parent_id));
Expand Down
2 changes: 1 addition & 1 deletion bpf/protocol_tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static __always_inline void handle_unknown_tcp_connection(pid_connection_info_t

tp_info_pid_t *server_tp = find_parent_trace();

if (server_tp && server_tp->valid) {
if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
bpf_dbg_printk("Found existing server tp for client call");
__builtin_memcpy(req->tp.trace_id, server_tp->tp.trace_id, sizeof(req->tp.trace_id));
__builtin_memcpy(req->tp.parent_id, server_tp->tp.span_id, sizeof(req->tp.parent_id));
Expand Down
2 changes: 1 addition & 1 deletion bpf/tcp_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ static __always_inline bool read_sk_buff(struct __sk_buff *skb, protocol_info_t
tcp->h_proto = h_proto;
tcp->hdr_len += doff;

if ((skb->len - tcp->hdr_len) < 0) { // less than 0 is a packet we can't parse
if (tcp->hdr_len > skb->len) { // bad packet, hdr_len is greater than the skb len, we can't parse this.
return false;
}

Expand Down
54 changes: 39 additions & 15 deletions bpf/trace_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ static __always_inline void delete_server_trace(trace_key_t *t_key) {
// bpf_dbg_printk("Deleting server span for id=%llx, pid=%d, ns=%d, res = %d", bpf_get_current_pid_tgid(), t_key->p_key.pid, t_key->p_key.ns, res);
}

static __always_inline u8 valid_span(unsigned char *span_id) {
return *((u64 *)span_id) != 0;
}

static __always_inline u8 valid_trace(unsigned char *trace_id) {
return *((u64 *)trace_id) != 0 && *((u64 *)(trace_id + 8)) != 0;
}

static __always_inline void server_or_client_trace(http_connection_metadata_t *meta, connection_info_t *conn, tp_info_pid_t *tp_p) {
if (!meta) {
return;
Expand Down Expand Up @@ -196,27 +204,47 @@ static __always_inline void get_or_create_trace_info(http_connection_metadata_t

if (meta) {
if (meta->type == EVENT_HTTP_CLIENT) {
// Before this change the client code only looked for a server wrapped trace and
// if it didn't find it would generate the trace information later. Now we look if
// the TC egress has setup TCP trace info for us. If we find this info we set the bool as having trace info,
// i.e. we must not regenerate it later. The kprobe on 'tcp_connect' does the lookup of the server trace
// for us, so the server context should already be setup.
tp_info_pid_t *in_tp = bpf_map_lookup_elem(&outgoing_trace_map, conn);
tp_p->pid = -1; // we only want to prevent correlation of duplicate server calls by PID
tp_info_pid_t *server_tp = find_parent_trace();

if (server_tp && server_tp->valid) {
if (in_tp) {
found_tp = 1;
bpf_dbg_printk("Found existing server tp for client call");
__builtin_memcpy(tp_p->tp.trace_id, server_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, server_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
tp_p = in_tp;
} else {
tp_info_pid_t *server_tp = find_parent_trace();

if (server_tp && server_tp->valid && valid_trace(server_tp->tp.trace_id)) {
found_tp = 1;
bpf_dbg_printk("Found existing server tp for client call");
__builtin_memcpy(tp_p->tp.trace_id, server_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, server_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
}
}
} else {
//bpf_dbg_printk("Looking up existing trace for connection");
//dbg_print_http_connection_info(conn);

tp_info_pid_t *existing_tp = trace_info_for_connection(conn);

if (correlated_requests(tp_p, existing_tp)) {
// For server requests, we first look for TCP info (setup by TC ingress) and then we fall back to black-box info.
tp_info_pid_t *existing_tp = bpf_map_lookup_elem(&incoming_trace_map, conn);
if (existing_tp) {
found_tp = 1;
bpf_dbg_printk("Found existing correlated tp for server request");
bpf_dbg_printk("Found incoming (TCP) tp for server request");
__builtin_memcpy(tp_p->tp.trace_id, existing_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, existing_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
}
} else {
existing_tp = trace_info_for_connection(conn);

if (correlated_requests(tp_p, existing_tp)) {
found_tp = 1;
bpf_dbg_printk("Found existing correlated tp for server request");
__builtin_memcpy(tp_p->tp.trace_id, existing_tp->tp.trace_id, sizeof(tp_p->tp.trace_id));
__builtin_memcpy(tp_p->tp.parent_id, existing_tp->tp.span_id, sizeof(tp_p->tp.parent_id));
}
}
}
}

Expand Down Expand Up @@ -276,8 +304,4 @@ static __always_inline void get_or_create_trace_info(http_connection_metadata_t
return;
}

static __always_inline u8 valid_span(unsigned char *span_id) {
return *((u64 *)span_id) != 0;
}

#endif
16 changes: 15 additions & 1 deletion bpf/tracing.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ struct {
__uint(pinning, LIBBPF_PIN_BY_NAME);
} trace_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, connection_info_t); // key: the connection info
__type(value, tp_info_pid_t); // value: traceparent info
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} incoming_trace_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, connection_info_t); // key: the connection info
__type(value, tp_info_pid_t); // value: traceparent info
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} outgoing_trace_map SEC(".maps");

static __always_inline void make_tp_string(unsigned char *buf, tp_info_t *tp) {
// Version
*buf++ = '0'; *buf++ = '0'; *buf++ = '-';
Expand All @@ -34,7 +48,7 @@ static __always_inline void make_tp_string(unsigned char *buf, tp_info_t *tp) {
}

static __always_inline tp_info_pid_t *trace_info_for_connection(connection_info_t *conn) {
return (tp_info_pid_t *)bpf_map_lookup_elem(&trace_map, conn);
return (tp_info_pid_t *)bpf_map_lookup_elem(&trace_map, conn);
}

static __always_inline u64 current_epoch(u64 ts) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/internal/ebpf/gotracer/bpf_arm64_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/gotracer/bpf_arm64_bpfel.o
Binary file not shown.
6 changes: 6 additions & 0 deletions pkg/internal/ebpf/gotracer/bpf_debug_arm64_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/gotracer/bpf_debug_arm64_bpfel.o
Binary file not shown.
Loading

0 comments on commit 6a94e5e

Please sign in to comment.