From c3ed13f50190ef9675bc7404d2daa3989db06f8d Mon Sep 17 00:00:00 2001 From: Damian Dimanov Date: Fri, 20 Feb 2026 15:38:23 +0200 Subject: [PATCH] Show multiple user PFs in the output --- sigsegv-monitor.bpf.c | 135 +++++++++++++++++++++++++++++++----------- sigsegv-monitor.c | 19 ++++-- sigsegv-monitor.h | 16 ++++- 3 files changed, 128 insertions(+), 42 deletions(-) diff --git a/sigsegv-monitor.bpf.c b/sigsegv-monitor.bpf.c index 49f28e8..c9d325f 100644 --- a/sigsegv-monitor.bpf.c +++ b/sigsegv-monitor.bpf.c @@ -4,10 +4,6 @@ #include #include "sigsegv-monitor.h" -// By default is commented: a lot of #PF events are hit -// so enable only if it is acceptable. -// #define TRACE_PF_CR2 - // if /sys/kernel/tracing/trace_on is set to 1, // cat /sys/kernel/tracing/trace // will show the bpf_printk() output @@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user { char __data[0]; }; +struct cr2_stat { + u64 cr2; + u64 err; + u64 tai; +}; + +struct cr2_stats { + struct cr2_stat stat[MAX_USER_PF_ENTRIES]; + u64 head; + u64 count; +}; + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1024); __type(key, u32); - __type(value, u64); + __type(value, struct cr2_stats); } tgid_cr2 SEC(".maps"); + +inline void cr2stats_init(struct cr2_stats* stats) { + stats->head = 0; + stats->count = 0; +} + +inline void cr2stats_push(struct cr2_stats* stats, struct cr2_stat* value) { + if (stats->head < MAX_USER_PF_ENTRIES) { + stats->stat[stats->head] = *value; + + if (++stats->head == MAX_USER_PF_ENTRIES) { + stats->head = 0; + } + + if (stats->count < MAX_USER_PF_ENTRIES) { + ++stats->count; + } + } +} + +// The `index` parameter here is not an index in the array, but an index in the ring buffer, +// i.e. passing an index 0 would return the oldest element in the ring buffer. 
+inline struct cr2_stat* cr2stats_get(struct cr2_stats* stats, u32 index) { + if (stats->count == MAX_USER_PF_ENTRIES) { + index += stats->head; + if (index >= MAX_USER_PF_ENTRIES) { + index -= MAX_USER_PF_ENTRIES; + } + } + + if (index < MAX_USER_PF_ENTRIES) { + return stats->stat + index; + } + + return NULL; +} #endif // Output map (for user space) @@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { bpf_probe_read_kernel_str(&event->tgleader_comm, sizeof(event->tgleader_comm), &task->group_leader->comm); // TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified? { - struct pid const* thread_pid = task->thread_pid; - unsigned int const level = thread_pid->level; - // thread_pid->numbers is a size-one flexible array member (type numbers[1]) - // => cannot perform bounds-check against BTF information - // => need bpf_probe_read_kernel to read from indices potentially > 1 - struct upid const* upid_inv = &thread_pid->numbers[level]; - event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call - } - { - struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID]; - unsigned int const level = tgid_pid->level; - struct upid const* tgid_upid_inv = &tgid_pid->numbers[level]; - // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread? - // TODO: don't we need RCU here? 
- event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr); - } - - event->regs.trapno = task->thread.trap_nr; // TODO: also copy the other fields like cr2 and error_code + struct pid const* thread_pid = task->thread_pid; + unsigned int const level = thread_pid->level; + // thread_pid->numbers is a size-one flexible array member (type numbers[1]) + // => cannot perform bounds-check against BTF information + // => need bpf_probe_read_kernel to read from indices potentially > 1 + struct upid const* upid_inv = &thread_pid->numbers[level]; + event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call + } + { + struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID]; + unsigned int const level = tgid_pid->level; + struct upid const* tgid_upid_inv = &tgid_pid->numbers[level]; + // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread? + // TODO: don't we need RCU here? + event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr); + } + + event->regs.trapno = task->thread.trap_nr; event->regs.err = task->thread.error_code; // TODO: how are these regs acquired? 
@@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { event->regs.flags = regs->flags; event->regs.cr2 = task->thread.cr2; - event->regs.cr2_fault = -1; - - #ifdef TRACE_PF_CR2 - u32 tgid = task->tgid; - u64 *cr2 = bpf_map_lookup_elem(&tgid_cr2, &tgid); + } - if (cr2) { - event->regs.cr2_fault = *cr2; - bpf_map_delete_elem(&tgid_cr2, &tgid); + event->pf_count = 0; + #ifdef TRACE_PF_CR2 + u32 tgid = task->tgid; + struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid); + + if (cr2stats) { + for (u32 i = 0; i < cr2stats->count && i < MAX_USER_PF_ENTRIES; i++) { + struct cr2_stat* stat = cr2stats_get(cr2stats, i); + if (stat) { + event->pf[i].cr2 = stat->cr2; + event->pf[i].err = stat->err; + event->pf[i].tai = stat->tai; + + ++event->pf_count; + } } - #endif + + bpf_map_delete_elem(&tgid_cr2, &tgid); } + #endif // TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel? long ret = bpf_get_branch_snapshot(&event->lbr, sizeof(event->lbr), 0); @@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { #ifdef TRACE_PF_CR2 SEC("tracepoint/exceptions/page_fault_user") int trace_page_fault(struct trace_event_raw_page_fault_user *ctx) { - u64 cr2; + struct cr2_stat stat; u32 tgid; - cr2 = ctx->address; + stat.cr2 = ctx->address; + stat.err = ctx->error_code; + stat.tai = bpf_ktime_get_tai_ns(); tgid = bpf_get_current_pid_tgid() >> 32; - bpf_map_update_elem(&tgid_cr2, &tgid, &cr2, BPF_ANY); + struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid); + if (cr2stats) { + cr2stats_push(cr2stats, &stat); + } else { + struct cr2_stats new_stats; + cr2stats_init(&new_stats); + cr2stats_push(&new_stats, &stat); + + bpf_map_update_elem(&tgid_cr2, &tgid, &new_stats, BPF_ANY); + } return 0; } diff --git a/sigsegv-monitor.c b/sigsegv-monitor.c index d8a7ac9..7d39a53 100644 --- a/sigsegv-monitor.c +++ b/sigsegv-monitor.c @@ -93,13 +93,22 @@ void handle_event(void *ctx, int cpu, void 
*data, __u32 data_sz) { printf("\"flags\":\"0x%016llx\",", e->regs.flags); printf("\"trapno\":\"0x%016llx\",", e->regs.trapno); printf("\"err\":\"0x%016llx\",", e->regs.err); - printf("\"cr2\":\"0x%016llx\",", e->regs.cr2); - if (e->regs.cr2_fault != (u64)-1) - printf("\"cr2_fault\":\"0x%016llx\"", e->regs.cr2_fault); - else - printf("\"cr2_fault\":null"); + printf("\"cr2\":\"0x%016llx\"", e->regs.cr2); printf("},"); + #ifdef TRACE_PF_CR2 + printf("\"page_faults\": ["); + for_each(i, e->pf_count) + { + printf("{\"cr2\":\"0x%016llx\",\"err\":\"0x%016llx\",\"tai\":%llu}", e->pf[i].cr2, e->pf[i].err, e->pf[i].tai); + + if (i + 1 != e->pf_count) { + printf(","); + } + } + printf("],"); + #endif + printf("\"lbr\":["); int lbr_limit = (e->lbr_count < MAX_LBR_ENTRIES) ? e->lbr_count : MAX_LBR_ENTRIES; for_each(i, lbr_limit) { diff --git a/sigsegv-monitor.h b/sigsegv-monitor.h index 1465aba..1fdceba 100644 --- a/sigsegv-monitor.h +++ b/sigsegv-monitor.h @@ -1,7 +1,17 @@ #pragma once - #define MAX_LBR_ENTRIES 32 +#define MAX_USER_PF_ENTRIES 16 + +// Commented out by default: a lot of #PF events are hit, +// so enable this only if that is acceptable. +// #define TRACE_PF_CR2 + +struct page_fault_info_t { + u64 cr2; + u64 err; + u64 tai; +}; struct user_regs_t { u64 rip; @@ -25,7 +35,6 @@ struct user_regs_t { u64 trapno; u64 err; u64 cr2; - u64 cr2_fault; }; // WARNING: this is for the SENDING process (e.g. pid) of the signal! @@ -45,4 +54,7 @@ struct event_t { struct perf_branch_entry lbr[MAX_LBR_ENTRIES]; u64 tai; // time atomic international + + u32 pf_count; + struct page_fault_info_t pf[MAX_USER_PF_ENTRIES]; };