diff --git a/bpf/Makefile b/bpf/Makefile
index 0aae8e3e5d..4732245976 100644
--- a/bpf/Makefile
+++ b/bpf/Makefile
@@ -23,7 +23,7 @@ BPF_BUNDLE := $(OUT_DIR)/parca-agent.bpf.tar.gz
 LIBBPF_HEADERS := $(OUT_DIR)/libbpf/$(ARCH)/usr/include
 VMLINUX_INCLUDE_PATH := $(SHORT_ARCH)
 
-BPF_SRC := unwinders/native.bpf.c
+BPF_SRC := unwinders/native.bpf.c unwinders/go_traceid.h
 RBPERF_SRC := unwinders/rbperf.bpf.c
 PYPERF_SRC := unwinders/pyperf.bpf.c
 OUT_PID_NAMESPACE_DETECTOR_SRC := pid_namespace.bpf.c
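The BPF header added below scans the current goroutine's pprof label map for a label named `otel.traceid` whose value is the 32-character hex form of a trace ID. For context, this is roughly how an instrumented Go service would attach that label so the agent can pick it up. A minimal sketch: `attachTraceID` is a hypothetical helper, only the label name and value format come from the code below.

```go
package main

import (
	"context"
	"runtime/pprof"
)

// attachTraceID runs fn with the "otel.traceid" pprof label set on the
// current goroutine, so samples taken while fn runs carry the trace ID.
// The value must be the 32-character hex form: the BPF code skips values
// of any other length (TRACEID_MAP_VAL_LENGTH).
func attachTraceID(ctx context.Context, traceIDHex string, fn func(context.Context)) {
	pprof.Do(ctx, pprof.Labels("otel.traceid", traceIDHex), fn)
}
```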
diff --git a/bpf/unwinders/go_traceid.h b/bpf/unwinders/go_traceid.h
new file mode 100644
index 0000000000..ef8aa68a59
--- /dev/null
+++ b/bpf/unwinders/go_traceid.h
@@ -0,0 +1,199 @@
+// +build ignore
+// ^^ this is a golang build tag meant to exclude this C file from compilation
+// by the CGO compiler
+//
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2024 The Parca Authors
+
+#include "vmlinux.h"
+#include "basic_types.h"
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include "tls.h"
+
+struct go_string
+{
+    char *str;
+    s64 len;
+};
+
+struct go_slice
+{
+    void *array;
+    s64 len;
+    s64 cap;
+};
+
+struct map_bucket {
+    char tophash[8];
+    struct go_string keys[8];
+    struct go_string values[8];
+    void *overflow;
+};
+
+struct
+{
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(key_size, sizeof(u32));
+    __uint(value_size, sizeof(struct map_bucket));
+    __uint(max_entries, 1);
+} golang_mapbucket_storage_map SEC(".maps");
+
+// length of "otel.traceid" is 12
+#define TRACEID_MAP_KEY_LENGTH 12
+#define TRACEID_MAP_VAL_LENGTH 32
+#define MAX_BUCKETS 8
+
+static __always_inline bool bpf_memcmp(char *s1, char *s2, s32 size)
+{
+    for (int i = 0; i < size; i++)
+    {
+        if (s1[i] != s2[i])
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static __always_inline void hex_string_to_bytes(char *str, u32 size, unsigned char *out)
+{
+    for (int i = 0; i < (size / 2); i++)
+    {
+        char ch0 = str[2 * i];
+        char ch1 = str[2 * i + 1];
+        u8 nib0 = (ch0 & 0xF) + (ch0 >> 6) | ((ch0 >> 3) & 0x8);
+        u8 nib1 = (ch1 & 0xF) + (ch1 >> 6) | ((ch1 >> 3) & 0x8);
+        out[i] = (nib0 << 4) | nib1;
+    }
+}
+
+// Go processes store the current goroutine in thread local store. From there
+// this reads the g (aka goroutine) struct, then the m (the actual operating
+// system thread) of that goroutine, and finally curg (current goroutine). This
+// chain is necessary because getg().m.curg points to the current user g
+// assigned to the thread (curg == getg() when not on the system stack). curg
+// may be nil if there is no user g, such as when running in the scheduler. If
+// curg is nil, then g is either a system stack (called g0) or a signal handler
+// g (gsignal). Neither one will ever have labels.
+static __always_inline bool get_trace_id(unsigned char *res_trace_id) {
+    long res;
+    struct task_struct *task = (struct task_struct *)bpf_get_current_task();
+    if (task == NULL) {
+        return false;
+    }
+
+    // In all Go binaries we have looked at, 0xfffffffffffffff8 (-8) appears to be the offset of `runtime.g` relative to the TLS base.
+    u64 g_addr_offset = 0xfffffffffffffff8;
+
+    size_t g_addr;
+    res = bpf_probe_read_user(&g_addr, sizeof(void *), (void *)(read_tls_base(task) + g_addr_offset));
+    if (res < 0) {
+        return false;
+    }
+
+    // DW_TAG_member
+    //   DW_AT_name ("m")
+    //   DW_AT_data_member_location (48)
+    //   DW_AT_type (0x0000000000088e39 "runtime.m *")
+    //   DW_AT_GO_embedded_field (0x00)
+    size_t m_ptr_addr;
+    res = bpf_probe_read_user(&m_ptr_addr, sizeof(void *), (void *)(g_addr + 48));
+    if (res < 0) {
+        return false;
+    }
+
+    // DW_TAG_member
+    //   DW_AT_name ("curg")
+    //   DW_AT_data_member_location (192)
+    //   DW_AT_type (0x00000000000892b1 "runtime.g *")
+    //   DW_AT_GO_embedded_field (0x00)
+    size_t curg_ptr_addr;
+    res = bpf_probe_read_user(&curg_ptr_addr, sizeof(void *), (void *)(m_ptr_addr + 192));
+    if (res < 0) {
+        return false;
+    }
+
+    // DW_TAG_member
+    //   DW_AT_name ("labels")
+    //   DW_AT_data_member_location (360)
+    //   DW_AT_type (0x000000000005c242 "void *")
+    //   DW_AT_GO_embedded_field (0x00)
+    void *labels_map_ptr_ptr;
+    res = bpf_probe_read_user(&labels_map_ptr_ptr, sizeof(void *), (void *)(curg_ptr_addr + 360));
+    if (res < 0) {
+        return false;
+    }
+
+    void *labels_map_ptr;
+    res = bpf_probe_read(&labels_map_ptr, sizeof(labels_map_ptr), labels_map_ptr_ptr);
+    if (res < 0) {
+        return false;
+    }
+
+    u64 labels_count = 0;
+    res = bpf_probe_read(&labels_count, sizeof(labels_count), labels_map_ptr);
+    if (res < 0) {
+        return false;
+    }
+    if (labels_count == 0) {
+        return false;
+    }
+
+    unsigned char log_2_bucket_count;
+    res = bpf_probe_read(&log_2_bucket_count, sizeof(log_2_bucket_count), labels_map_ptr + 9);
+    if (res < 0) {
+        return false;
+    }
+    u64 bucket_count = 1 << log_2_bucket_count;
+    void *label_buckets;
+    res = bpf_probe_read(&label_buckets, sizeof(label_buckets), labels_map_ptr + 16);
+    if (res < 0) {
+        return false;
+    }
+
+    u32 map_id = 0;
+    // This needs to be allocated in a per-cpu map, because it's too large and
+    // can't be allocated on the stack (which is limited to 512 bytes in BPF).
+    struct map_bucket *map_value = bpf_map_lookup_elem(&golang_mapbucket_storage_map, &map_id);
+    if (!map_value) {
+        return false;
+    }
+
+    for (u64 j = 0; j < MAX_BUCKETS; j++) {
+        if (j >= bucket_count) {
+            break;
+        }
+        res = bpf_probe_read(map_value, sizeof(struct map_bucket), label_buckets + (j * sizeof(struct map_bucket)));
+        if (res < 0) {
+            continue;
+        }
+        for (u64 i = 0; i < 8; i++) {
+            if (map_value->tophash[i] == 0) {
+                continue;
+            }
+            if (map_value->keys[i].len != TRACEID_MAP_KEY_LENGTH) {
+                continue;
+            }
+
+            char current_label_key[TRACEID_MAP_KEY_LENGTH];
+            bpf_probe_read(current_label_key, sizeof(current_label_key), map_value->keys[i].str);
+            if (!bpf_memcmp(current_label_key, "otel.traceid", TRACEID_MAP_KEY_LENGTH)) {
+                continue;
+            }
+
+            if (map_value->values[i].len != TRACEID_MAP_VAL_LENGTH) {
+                continue;
+            }
+
+            char trace_id[TRACEID_MAP_VAL_LENGTH];
+            bpf_probe_read(trace_id, TRACEID_MAP_VAL_LENGTH, map_value->values[i].str);
+
+            hex_string_to_bytes(trace_id, TRACEID_MAP_VAL_LENGTH, res_trace_id);
+            return true;
+        }
+    }
+
+    return false;
+}
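Two details in the header above are worth unpacking. The hard-coded offsets read from the label map header (count at +0, B at +9, buckets at +16) match the Go runtime's `hmap` layout for a `map[string]string`, and `struct map_bucket` mirrors the runtime's 8-slot bucket. Separately, the branch-free nibble decode in `hex_string_to_bytes` works because `(ch & 0xF) + (ch >> 6)` yields 0-9 for ASCII digits and 2-7 for the letters `a`-`f`/`A`-`F` (the `>> 6` contributes the +1 for letters), and OR-ing `(ch >> 3) & 0x8` supplies the missing 8 for letters. A standalone Go translation to sanity-check it (not part of the change):

```go
package main

import "fmt"

// nibble mirrors ((ch & 0xF) + (ch >> 6)) | ((ch >> 3) & 0x8) from
// hex_string_to_bytes. For '0'..'9' (0x30..0x39) both correction terms
// are zero, leaving ch & 0xF. For 'a'..'f' (0x61..0x66) and 'A'..'F'
// (0x41..0x46), ch>>6 adds 1 and (ch>>3)&0x8 ORs in 8:
// 'a' -> (1+1)|8 = 10, 'F' -> (6+1)|8 = 15.
func nibble(ch byte) byte {
	return ((ch & 0xF) + (ch >> 6)) | ((ch >> 3) & 0x8)
}

func main() {
	for _, ch := range []byte("0123456789abcdefABCDEF") {
		fmt.Printf("%c=%x ", ch, nibble(ch))
	}
	fmt.Println()
}
```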
diff --git a/bpf/unwinders/native.bpf.c b/bpf/unwinders/native.bpf.c
index c0f30d6b98..1feee626e6 100644
--- a/bpf/unwinders/native.bpf.c
+++ b/bpf/unwinders/native.bpf.c
@@ -14,6 +14,7 @@
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
 #include "shared.h"
+#include "go_traceid.h"
 
 /*================================ CONSTANTS =================================*/
 // Programs.
@@ -119,10 +120,10 @@ struct unwinder_config_t {
     bool mixed_stack_enabled;
     bool python_enabled;
     bool ruby_enabled;
-    /* 3 byte of padding */
+    bool collect_trace_id;
+    /* 2 bytes of padding */
     bool _padding1;
     bool _padding2;
-    bool _padding3;
     u32 rate_limit_unwind_info;
     u32 rate_limit_process_mappings;
     u32 rate_limit_refresh_process_info;
@@ -666,6 +667,10 @@ static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_t
     stack_key->pid = per_process_id;
     stack_key->tgid = per_thread_id;
 
+    if (unwinder_config.collect_trace_id) {
+        get_trace_id(stack_key->trace_id);
+    }
+
     // Hash and add user stack.
     u64 user_stack_id = hash_stack(&unwind_state->stack, 0);
     stack_key->user_stack_id = user_stack_id;
@@ -1110,6 +1115,7 @@ static __always_inline bool set_initial_state(struct bpf_perf_event_data *ctx) {
     unwind_state->stack_key.user_stack_id = 0;
     unwind_state->stack_key.kernel_stack_id = 0;
     unwind_state->stack_key.interpreter_stack_id = 0;
+    __builtin_memset(unwind_state->stack_key.trace_id, 0, 16);
 
     u64 ip = 0;
     u64 sp = 0;
diff --git a/bpf/unwinders/pyperf.bpf.c b/bpf/unwinders/pyperf.bpf.c
index ccd791dd0c..4fd72a73de 100644
--- a/bpf/unwinders/pyperf.bpf.c
+++ b/bpf/unwinders/pyperf.bpf.c
@@ -13,6 +13,7 @@
 #include "hash.h"
 #include "shared.h"
+#include "tls.h"
 
 //
 // ╔═════════════════════════════════════════════════════════════════════════╗
 //
@@ -151,20 +152,6 @@ static inline __attribute__((__always_inline__)) int tls_read(void *tls_base, In
     return 0;
 }
 
-static inline __attribute__((__always_inline__)) long unsigned int read_tls_base(struct task_struct *task) {
-    long unsigned int tls_base;
-// This changes depending on arch and kernel version.
-// task->thread.fs, task->thread.uw.tp_value, etc.
-#if __TARGET_ARCH_x86
-    tls_base = BPF_CORE_READ(task, thread.fsbase);
-#elif __TARGET_ARCH_arm64
-    tls_base = BPF_CORE_READ(task, thread.uw.tp_value);
-#else
-#error "Unsupported platform"
-#endif
-    return tls_base;
-}
-
 //
 // ╔═════════════════════════════════════════════════════════════════════════╗
 // ║ BPF Programs ║
diff --git a/bpf/unwinders/shared.h b/bpf/unwinders/shared.h
index b51c7743cc..2e3f325611 100644
--- a/bpf/unwinders/shared.h
+++ b/bpf/unwinders/shared.h
@@ -15,6 +15,7 @@ typedef struct {
     u64 user_stack_id;
     u64 kernel_stack_id;
     u64 interpreter_stack_id;
+    unsigned char trace_id[16];
 } stack_count_key_t;
 
 typedef struct {
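The new `trace_id` field grows the BPF map key, and the Go side reads that key back as raw bytes, so the two struct layouts must agree byte for byte. A hedged sketch of the kind of compile-time size guard one could keep next to the Go mirror of this struct; the field set matches the stack-count key change in `pkg/profiler/cpu/cpu.go` further down, but the struct name here and the guard itself are assumptions, not part of this change:

```go
package cpu

import "unsafe"

// stackCountKey mirrors stack_count_key_t in bpf/unwinders/shared.h.
type stackCountKey struct {
	PID                int32
	TID                int32
	UserStackID        uint64
	KernelStackID      uint64
	InterpreterStackID uint64
	TraceID            [16]byte
}

// Compile-time guard: 2*4 (pid/tgid) + 3*8 (stack IDs) + 16 (trace_id) = 48
// bytes. If the Go layout ever drifts from the C layout, this fails to build.
var _ [48]byte = [unsafe.Sizeof(stackCountKey{})]byte{}
```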
diff --git a/bpf/unwinders/tls.h b/bpf/unwinders/tls.h
new file mode 100644
index 0000000000..a0489d0c04
--- /dev/null
+++ b/bpf/unwinders/tls.h
@@ -0,0 +1,24 @@
+// +build ignore
+// ^^ this is a golang build tag meant to exclude this C file from compilation
+// by the CGO compiler
+//
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2024 The Parca Authors
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_core_read.h>
+
+static inline __attribute__((__always_inline__)) long unsigned int read_tls_base(struct task_struct *task) {
+    long unsigned int tls_base;
+// This changes depending on arch and kernel version.
+// task->thread.fs, task->thread.uw.tp_value, etc.
+#if __TARGET_ARCH_x86
+    tls_base = BPF_CORE_READ(task, thread.fsbase);
+#elif __TARGET_ARCH_arm64
+    tls_base = BPF_CORE_READ(task, thread.uw.tp_value);
+#else
+#error "Unsupported platform"
+#endif
+    return tls_base;
+}
diff --git a/cmd/parca-agent/main.go b/cmd/parca-agent/main.go
index 4aabe4eb0c..e1a57b3cbe 100644
--- a/cmd/parca-agent/main.go
+++ b/cmd/parca-agent/main.go
@@ -144,6 +144,8 @@ type flags struct {
 	PythonUnwindingDisable bool `default:"false" help:"Disable Python unwinder."`
 	RubyUnwindingDisable   bool `default:"false" help:"Disable Ruby unwinder."`
 
+	CollectTraceID bool `default:"false" help:"Attempt to collect trace ID from the process."`
+
 	AnalyticsOptOut bool `default:"false" help:"Opt out of sending anonymous usage statistics."`
 
 	Telemetry FlagsTelemetry `embed:"" prefix:"telemetry-"`
@@ -962,6 +964,7 @@ func run(logger log.Logger, reg *prometheus.Registry, flags flags, numCPU int) e
 			RateLimitUnwindInfo:         flags.Hidden.RateLimitUnwindInfo,
 			RateLimitProcessMappings:    flags.Hidden.RateLimitProcessMappings,
 			RateLimitRefreshProcessInfo: flags.Hidden.RateLimitRefreshProcessInfo,
+			CollectTraceID:              flags.CollectTraceID,
 		},
 		bpfProgramLoaded,
 		ofp,
diff --git a/pkg/pprof/pprof.go b/pkg/pprof/pprof.go
index c7e221e8c4..8779b6ea56 100644
--- a/pkg/pprof/pprof.go
+++ b/pkg/pprof/pprof.go
@@ -15,6 +15,7 @@ package pprof
 
 import (
 	"context"
+	"encoding/hex"
 	"errors"
 	"io/fs"
 	"strconv"
@@ -174,6 +175,15 @@ const (
 	threadNameLabel = "thread_name"
 )
 
+func isNonEmptyTraceID(traceID [16]byte) bool {
+	for _, b := range traceID {
+		if b != 0 {
+			return true
+		}
+	}
+	return false
+}
+
 // Convert converts a profile to a pprof profile. It is intended to only be
 // used once.
 func (c *Converter) Convert(ctx context.Context, rawData []profile.RawSample) (*pprofprofile.Profile, []*profilestorepb.ExecutableInfo, error) {
@@ -254,6 +264,9 @@ func (c *Converter) Convert(ctx context.Context, rawData []profile.RawSample) (*
 		if threadName != "" {
 			pprofSample.Label[threadNameLabel] = append(pprofSample.Label[threadNameLabel], threadName)
 		}
+		if isNonEmptyTraceID(sample.TraceID) {
+			pprofSample.Label["trace_id"] = append(pprofSample.Label["trace_id"], hex.EncodeToString(sample.TraceID[:]))
+		}
 
 		c.result.Sample = append(c.result.Sample, pprofSample)
 	}
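One detail worth noting in the `pprof.go` change: the BPF side already decoded the label's 32 hex characters into 16 raw bytes, and `Convert` re-encodes them with `hex.EncodeToString`, so the emitted `trace_id` label carries the original trace ID in canonical lower-case hex. A small illustration of the round trip, using only the standard library (`hex.DecodeString` stands in for the BPF-side `hex_string_to_bytes`):

```go
package main

import (
	"encoding/hex"
	"fmt"
)

func main() {
	// What an application would set as the "otel.traceid" pprof label value.
	in := "4BF92F3577B34DA6A3CE929D0E0E4736"

	// The BPF side decodes the 32 hex characters into 16 raw bytes stored
	// in the stack key.
	var traceID [16]byte
	raw, err := hex.DecodeString(in)
	if err != nil {
		panic(err)
	}
	copy(traceID[:], raw)

	// Convert re-encodes the bytes as the "trace_id" sample label.
	fmt.Println(hex.EncodeToString(traceID[:])) // 4bf92f3577b34da6a3ce929d0e0e4736
}
```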
diff --git a/pkg/profile/profile.go b/pkg/profile/profile.go
index bd35f6f2ec..ae35fbbbb6 100644
--- a/pkg/profile/profile.go
+++ b/pkg/profile/profile.go
@@ -31,6 +31,7 @@ type RawSample struct {
 	// frame.
 	InterpreterStack []uint64
 	Value            uint64
+	TraceID          [16]byte
 }
 
 type RawData []ProcessRawData
diff --git a/pkg/profiler/cpu/cpu.go b/pkg/profiler/cpu/cpu.go
index 1a34163292..c0be3c66a5 100644
--- a/pkg/profiler/cpu/cpu.go
+++ b/pkg/profiler/cpu/cpu.go
@@ -68,9 +68,9 @@ type UnwinderConfig struct {
 	MixedStackWalking bool
 	PythonEnable      bool
 	RubyEnabled       bool
+	CollectTraceID    bool
 	Padding1          bool
 	Padding2          bool
-	Padding3          bool
 	RateLimitUnwindInfo         uint32
 	RateLimitProcessMappings    uint32
 	RateLimitRefreshProcessInfo uint32
@@ -99,6 +99,8 @@ type Config struct {
 	RateLimitUnwindInfo         uint32
 	RateLimitProcessMappings    uint32
 	RateLimitRefreshProcessInfo uint32
+
+	CollectTraceID bool
 }
 
 func (c Config) DebugModeEnabled() bool {
@@ -309,9 +311,9 @@ func loadBPFModules(logger log.Logger, reg prometheus.Registerer, memlockRlimit
 		MixedStackWalking: config.DWARFUnwindingMixedModeEnabled,
 		PythonEnable:      config.PythonUnwindingEnabled,
 		RubyEnabled:       config.RubyUnwindingEnabled,
+		CollectTraceID:    config.CollectTraceID,
 		Padding1:          false,
 		Padding2:          false,
-		Padding3:          false,
 		RateLimitUnwindInfo:         config.RateLimitUnwindInfo,
 		RateLimitProcessMappings:    config.RateLimitProcessMappings,
 		RateLimitRefreshProcessInfo: config.RateLimitRefreshProcessInfo,
@@ -980,12 +982,14 @@ type (
 		UserStackID        uint64
 		KernelStackID      uint64
 		InterpreterStackID uint64
+		TraceID            [16]byte
 	}
 )
 
 type profileKey struct {
-	pid int32
-	tid int32
+	pid     int32
+	tid     int32
+	traceID [16]byte
 }
 
 // interpreterSymbolTable returns an up-to-date symbol table for the interpreter.
@@ -1054,7 +1058,7 @@ func (p *CPU) obtainRawData(ctx context.Context) (profile.RawData, error) {
 		}
 
 		// Profile aggregation key.
-		pKey := profileKey{pid: key.PID, tid: key.TID}
+		pKey := profileKey{pid: key.PID, tid: key.TID, traceID: key.TraceID}
 
 		// Twice the stack depth because we have a user and a potential Kernel stack.
 		// Read order matters, since we read from the key buffer.
@@ -1202,6 +1206,7 @@ func preprocessRawData(rawData map[profileKey]map[bpfprograms.CombinedStack]uint
 			KernelStack:      kernelStack,
 			InterpreterStack: interpreterStack,
 			Value:            count,
+			TraceID:          pKey.traceID,
 		})
 	}
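Finally, note why `traceID` had to join `profileKey`: samples are aggregated per key before being turned into `RawSample`s, and without the trace ID in the key, stacks belonging to different requests on the same thread would collapse into a single count. A toy illustration of the effect (simplified types, not part of the change; Go arrays are comparable, so the struct works directly as a map key):

```go
package main

import "fmt"

type profileKey struct {
	pid     int32
	tid     int32
	traceID [16]byte
}

func main() {
	counts := map[profileKey]uint64{}

	a := profileKey{pid: 42, tid: 42}
	b := a
	b.traceID[0] = 0xAB // same thread, but a different request's trace ID

	counts[a]++
	counts[b]++

	// Two separate entries: the trace ID keeps per-request samples apart.
	fmt.Println(len(counts)) // 2
}
```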