Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 100 additions & 35 deletions sigsegv-monitor.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
#include <bpf/bpf_tracing.h>
#include "sigsegv-monitor.h"

// By default is commented: a lot of #PF events are hit
// so enable only if it is acceptable.
// #define TRACE_PF_CR2

// if /sys/kernel/tracing/trace_on is set to 1,
// cat /sys/kernel/tracing/trace
// will show the bpf_printk() output
Expand All @@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user {
char __data[0];
};

// One recorded user-space page-fault sample.
struct cr2_stat {
    u64 cr2; // faulting address (tracepoint `address` field, i.e. CR2 on x86)
    u64 err; // page-fault error code (tracepoint `error_code` field)
    u64 tai; // timestamp in TAI nanoseconds, from bpf_ktime_get_tai_ns()
};

// Fixed-capacity ring buffer holding the most recent page-fault samples
// for one process (map value keyed by tgid).
struct cr2_stats {
    struct cr2_stat stat[MAX_USER_PF_ENTRIES]; // ring storage
    u64 head;  // next write slot; kept in [0, MAX_USER_PF_ENTRIES) by cr2stats_push
    u64 count; // number of valid entries; saturates at MAX_USER_PF_ENTRIES
};

// Per-process ring buffer of recent user page faults, keyed by tgid.
// Filled by trace_page_fault; consumed and deleted by trace_sigsegv
// when a SIGSEGV is generated for the process.
// NOTE: the rendered diff carried both the old `u64` and the new
// `struct cr2_stats` value type; only the latter is valid here.
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, u32);
    __type(value, struct cr2_stats);
} tgid_cr2 SEC(".maps");

// Reset a ring buffer to the empty state. The stat[] payload is left
// untouched: `count` alone determines how many entries are valid.
// `static inline` (not bare `inline`): a plain C99 `inline` function has
// no guaranteed out-of-line definition, which can fail at link/load time;
// internal linkage is the conventional form for BPF helpers.
static inline void cr2stats_init(struct cr2_stats* stats) {
    stats->head = 0;
    stats->count = 0;
}

// Append one sample, overwriting the oldest entry once the ring is full.
// `head` wraps at MAX_USER_PF_ENTRIES; `count` saturates there.
// The outer bounds check on `head` is redundant for well-formed state but
// presumably lets the BPF verifier prove the array store is in range --
// TODO confirm before removing it.
// `static inline` instead of bare `inline`: see cr2stats_init.
static inline void cr2stats_push(struct cr2_stats* stats, struct cr2_stat* value) {
    if (stats->head < MAX_USER_PF_ENTRIES) {
        stats->stat[stats->head] = *value;

        // Advance the write cursor, wrapping to the start of the ring.
        if (++stats->head == MAX_USER_PF_ENTRIES) {
            stats->head = 0;
        }

        // Saturating element count: once full, old entries are overwritten.
        if (stats->count < MAX_USER_PF_ENTRIES) {
            ++stats->count;
        }
    }
}

// The `index` parameter here is not an index in the array, but a logical
// index in the ring buffer: index 0 returns the oldest recorded element.
// Returns NULL when `index` is out of range (>= count capacity bound).
// `static inline` instead of bare `inline`: see cr2stats_init.
static inline struct cr2_stat* cr2stats_get(struct cr2_stats* stats, u32 index) {
    // Only when the ring has wrapped does the oldest element sit at
    // `head`; before that, logical and physical indices coincide.
    if (stats->count == MAX_USER_PF_ENTRIES) {
        index += stats->head;
        if (index >= MAX_USER_PF_ENTRIES) {
            index -= MAX_USER_PF_ENTRIES;
        }
    }

    // Explicit bound also serves as the verifier-visible range proof for
    // the pointer arithmetic below.
    if (index < MAX_USER_PF_ENTRIES) {
        return stats->stat + index;
    }

    return NULL;
}
#endif

// Output map (for user space)
Expand Down Expand Up @@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
bpf_probe_read_kernel_str(&event->tgleader_comm, sizeof(event->tgleader_comm), &task->group_leader->comm);
// TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified?
{
struct pid const* thread_pid = task->thread_pid;
unsigned int const level = thread_pid->level;
// thread_pid->numbers is a size-one flexible array member (type numbers[1])
// => cannot perform bounds-check against BTF information
// => need bpf_probe_read_kernel to read from indices potentially > 1
struct upid const* upid_inv = &thread_pid->numbers[level];
event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
}
{
struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
unsigned int const level = tgid_pid->level;
struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
// TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
// TODO: don't we need RCU here?
event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
}

event->regs.trapno = task->thread.trap_nr; // TODO: also copy the other fields like cr2 and error_code
struct pid const* thread_pid = task->thread_pid;
unsigned int const level = thread_pid->level;
// thread_pid->numbers is a size-one flexible array member (type numbers[1])
// => cannot perform bounds-check against BTF information
// => need bpf_probe_read_kernel to read from indices potentially > 1
struct upid const* upid_inv = &thread_pid->numbers[level];
event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
}
{
struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
unsigned int const level = tgid_pid->level;
struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
// TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
// TODO: don't we need RCU here?
event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
}

event->regs.trapno = task->thread.trap_nr;
event->regs.err = task->thread.error_code;

// TODO: how are these regs acquired?
Expand All @@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
event->regs.flags = regs->flags;

event->regs.cr2 = task->thread.cr2;
event->regs.cr2_fault = -1;

#ifdef TRACE_PF_CR2
u32 tgid = task->tgid;
u64 *cr2 = bpf_map_lookup_elem(&tgid_cr2, &tgid);
}

if (cr2) {
event->regs.cr2_fault = *cr2;
bpf_map_delete_elem(&tgid_cr2, &tgid);
event->pf_count = 0;
#ifdef TRACE_PF_CR2
u32 tgid = task->tgid;
Copy link
Contributor

@work-robot work-robot Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if we should use the process ID (tgid) or rather the thread ID (pid) for the key of the map... on the one hand, the map can be smaller. On the other hand, we'd need to record which thread generated the PF, and we might rotate through the ring buffer too quickly.

I'm also not sure if we'd need to use some form of locking if multiple threads can write into the ring buffer.

Copy link
Contributor

@work-robot work-robot Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah we'd need locking: https://docs.ebpf.io/linux/helper-function/bpf_map_lookup_elem/
I just don't know if per-thread would be sufficient, or if we'd need per-CPU.

struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid);

if (cr2stats) {
for (u32 i = 0; i < cr2stats->count && i < MAX_USER_PF_ENTRIES; i++) {
struct cr2_stat* stat = cr2stats_get(cr2stats, i);
if (stat) {
event->pf[i].cr2 = stat->cr2;
event->pf[i].err = stat->err;
event->pf[i].tai = stat->tai;

++event->pf_count;
}
}
#endif

bpf_map_delete_elem(&tgid_cr2, &tgid);
}
#endif

// TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel?
long ret = bpf_get_branch_snapshot(&event->lbr, sizeof(event->lbr), 0);
Expand All @@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
#ifdef TRACE_PF_CR2
// Fires on every user-space page fault. Records the faulting address,
// error code and a TAI timestamp into the per-tgid ring buffer so that
// trace_sigsegv can report the most recent faults when the process
// eventually receives a SIGSEGV.
//
// NOTE(review): the lookup-then-push sequence is not serialized; two
// threads of the same tgid faulting concurrently may race on the ring
// state -- confirm whether bpf_spin_lock or a per-thread/per-CPU layout
// is needed (see PR discussion).
SEC("tracepoint/exceptions/page_fault_user")
int trace_page_fault(struct trace_event_raw_page_fault_user *ctx) {
    struct cr2_stat stat;
    u32 tgid;

    stat.cr2 = ctx->address;
    stat.err = ctx->error_code;
    stat.tai = bpf_ktime_get_tai_ns();
    tgid = bpf_get_current_pid_tgid() >> 32;

    struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid);
    if (cr2stats) {
        cr2stats_push(cr2stats, &stat);
    } else {
        // First fault seen for this tgid: build a fresh ring on the stack
        // and insert it. Zero-initialize: bpf_map_update_elem copies the
        // whole value, and the verifier rejects (and we must not leak)
        // uninitialized stack bytes.
        struct cr2_stats new_stats = {};
        cr2stats_init(&new_stats);
        cr2stats_push(&new_stats, &stat);

        bpf_map_update_elem(&tgid_cr2, &tgid, &new_stats, BPF_ANY);
    }

    return 0;
}
Expand Down
19 changes: 14 additions & 5 deletions sigsegv-monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,22 @@ void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) {
printf("\"flags\":\"0x%016llx\",", e->regs.flags);
printf("\"trapno\":\"0x%016llx\",", e->regs.trapno);
printf("\"err\":\"0x%016llx\",", e->regs.err);
printf("\"cr2\":\"0x%016llx\",", e->regs.cr2);
if (e->regs.cr2_fault != (u64)-1)
printf("\"cr2_fault\":\"0x%016llx\"", e->regs.cr2_fault);
else
printf("\"cr2_fault\":null");
printf("\"cr2\":\"0x%016llx\"", e->regs.cr2);
printf("},");

#ifdef TRACE_PF_CR2
printf("\"page_faults\": [");
for_each(i, e->pf_count)
{
printf("{\"cr2\":\"0x%016llx\",\"err\":\"0x%016llx\",\"tai\":%llu}", e->pf[i].cr2, e->pf[i].err, e->pf[i].tai);

if (i + 1 != e->pf_count) {
printf(",");
}
}
printf("],");
#endif

printf("\"lbr\":[");
int lbr_limit = (e->lbr_count < MAX_LBR_ENTRIES) ? e->lbr_count : MAX_LBR_ENTRIES;
for_each(i, lbr_limit) {
Expand Down
16 changes: 14 additions & 2 deletions sigsegv-monitor.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
#pragma once


#define MAX_LBR_ENTRIES 32
#define MAX_USER_PF_ENTRIES 16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

16 is fine if the ring buffer is for a single thread, but I fear that with dozens of threads, there might be too many PF, so any interesting one might be rotated out by the time we land in signal_generate. But due to the locking requirement (see my comment in the .bpf.c file) I think we should split this into a per-thread data structure (or per-CPU but while recording the pid, not sure...).


// By default is commented: a lot of #PF events are hit
// so enable only if it is acceptable.
// #define TRACE_PF_CR2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should enable this, since it doesn't seem to have much of a performance impact.


// One user page-fault record exported to user space; field-for-field it
// matches the sample pushed by the BPF side.
struct page_fault_info_t {
    u64 cr2; // faulting address
    u64 err; // page-fault error code
    u64 tai; // timestamp in TAI nanoseconds
};

struct user_regs_t {
u64 rip;
Expand All @@ -25,7 +35,6 @@ struct user_regs_t {
u64 trapno;
u64 err;
u64 cr2;
u64 cr2_fault;
};

// WARNING: this is for the SENDING process (e.g. pid) of the signal!
Expand All @@ -45,4 +54,7 @@ struct event_t {
struct perf_branch_entry lbr[MAX_LBR_ENTRIES];

u64 tai; // time atomic international

u32 pf_count;
struct page_fault_info_t pf[MAX_USER_PF_ENTRIES];
};