From c3ed13f50190ef9675bc7404d2daa3989db06f8d Mon Sep 17 00:00:00 2001
From: Damian Dimanov <damian.dimanov@gmail.com>
Date: Fri, 20 Feb 2026 15:38:23 +0200
Subject: [PATCH] Show multiple user PF's in the output

---
 sigsegv-monitor.bpf.c | 135 +++++++++++++++++++++++++++++++-----------
 sigsegv-monitor.c     |  19 ++++--
 sigsegv-monitor.h     |  16 ++++-
 3 files changed, 128 insertions(+), 42 deletions(-)

diff --git a/sigsegv-monitor.bpf.c b/sigsegv-monitor.bpf.c
index 49f28e8..c9d325f 100644
--- a/sigsegv-monitor.bpf.c
+++ b/sigsegv-monitor.bpf.c
@@ -4,10 +4,6 @@
 #include <bpf/bpf_tracing.h>
 #include "sigsegv-monitor.h"
 
-// By default is commented: a lot of #PF events are hit
-// so enable only if it is acceptable.
-// #define TRACE_PF_CR2
-
 // if /sys/kernel/tracing/trace_on  is set to 1,
 //   cat /sys/kernel/tracing/trace
 // will show the bpf_printk() output
@@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user {
     char __data[0];
 };
 
+struct cr2_stat {
+    u64 cr2;
+    u64 err;
+    u64 tai;
+};
+
+struct cr2_stats {
+    struct cr2_stat stat[MAX_USER_PF_ENTRIES];
+    u64 head;
+    u64 count;
+};
+
 struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(max_entries, 1024);
     __type(key, u32);
-    __type(value, u64);
+    __type(value, struct cr2_stats);
 } tgid_cr2 SEC(".maps");
+
+inline void cr2stats_init(struct cr2_stats* stats) {
+    stats->head = 0;
+    stats->count = 0;
+}
+
+inline void cr2stats_push(struct cr2_stats* stats, struct cr2_stat* value) {
+    if (stats->head < MAX_USER_PF_ENTRIES) {
+        stats->stat[stats->head] = *value;
+
+        if (++stats->head == MAX_USER_PF_ENTRIES) {
+            stats->head = 0;
+        }
+
+        if (stats->count < MAX_USER_PF_ENTRIES) {
+            ++stats->count;
+        }
+    }
+}
+
+// The `index` parameter here is not an index in the array, but an index in the ring buffer,
+// i.e. passing an index 0 would return the oldest element in the ring buffer.
+inline struct cr2_stat* cr2stats_get(struct cr2_stats* stats, u32 index) {
+    if (stats->count == MAX_USER_PF_ENTRIES) {
+        index += stats->head;
+        if (index >= MAX_USER_PF_ENTRIES) {
+            index -= MAX_USER_PF_ENTRIES;
+        }
+    }
+
+    if (index < MAX_USER_PF_ENTRIES) {
+        return stats->stat + index;
+    }
+
+    return NULL;
+}
 #endif
 
 // Output map (for user space)
@@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
     bpf_probe_read_kernel_str(&event->tgleader_comm, sizeof(event->tgleader_comm), &task->group_leader->comm);
     // TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified?
     {
-		struct pid const* thread_pid = task->thread_pid;
-		unsigned int const level = thread_pid->level;
-		// thread_pid->numbers is a size-one flexible array member (type numbers[1])
-		// => cannot perform bounds-check against BTF information
-		// => need bpf_probe_read_kernel to read from indices potentially > 1
-		struct upid const* upid_inv = &thread_pid->numbers[level];
-		event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
-	}
-	{
-		struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
-		unsigned int const level = tgid_pid->level;
-		struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
-		// TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
-		// TODO: don't we need RCU here?
-		event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
-	}
-
-    event->regs.trapno = task->thread.trap_nr; // TODO: also copy the other fields like cr2 and error_code
+        struct pid const* thread_pid = task->thread_pid;
+        unsigned int const level = thread_pid->level;
+        // thread_pid->numbers is a size-one flexible array member (type numbers[1])
+        // => cannot perform bounds-check against BTF information
+        // => need bpf_probe_read_kernel to read from indices potentially > 1
+        struct upid const* upid_inv = &thread_pid->numbers[level];
+        event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
+    }
+    {
+        struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
+        unsigned int const level = tgid_pid->level;
+        struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
+        // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
+        // TODO: don't we need RCU here?
+        event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
+    }
+
+    event->regs.trapno = task->thread.trap_nr;
     event->regs.err = task->thread.error_code;
 
     // TODO: how are these regs acquired?
@@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
         event->regs.flags = regs->flags;
 
         event->regs.cr2 = task->thread.cr2;
-        event->regs.cr2_fault = -1;
-
-        #ifdef TRACE_PF_CR2
-        u32 tgid = task->tgid;
-        u64 *cr2 = bpf_map_lookup_elem(&tgid_cr2, &tgid);
+    }
 
-        if (cr2) {
-            event->regs.cr2_fault = *cr2;
-            bpf_map_delete_elem(&tgid_cr2, &tgid);
+    event->pf_count = 0;
+    #ifdef TRACE_PF_CR2
+    u32 tgid = task->tgid;
+    struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid);
+
+    if (cr2stats) {
+        for (u32 i = 0; i < cr2stats->count && i < MAX_USER_PF_ENTRIES; i++) {
+            struct cr2_stat* stat = cr2stats_get(cr2stats, i);
+            if (stat) {
+                event->pf[i].cr2 = stat->cr2;
+                event->pf[i].err = stat->err;
+                event->pf[i].tai = stat->tai;
+
+                ++event->pf_count;
+            }
         }
-        #endif
+
+        bpf_map_delete_elem(&tgid_cr2, &tgid);
     }
+    #endif
 
     // TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel?
     long ret = bpf_get_branch_snapshot(&event->lbr, sizeof(event->lbr), 0);
@@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
 #ifdef TRACE_PF_CR2
 SEC("tracepoint/exceptions/page_fault_user")
 int trace_page_fault(struct trace_event_raw_page_fault_user *ctx) {
-    u64 cr2;
+    struct cr2_stat stat;
     u32 tgid;
 
-    cr2  = ctx->address;
+    stat.cr2 = ctx->address;
+    stat.err = ctx->error_code;
+    stat.tai = bpf_ktime_get_tai_ns();
     tgid = bpf_get_current_pid_tgid() >> 32;
 
-    bpf_map_update_elem(&tgid_cr2, &tgid, &cr2, BPF_ANY);
+    struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid);
+    if (cr2stats) {
+        cr2stats_push(cr2stats, &stat);
+    } else {
+        struct cr2_stats new_stats;
+        cr2stats_init(&new_stats);
+        cr2stats_push(&new_stats, &stat);
+
+        bpf_map_update_elem(&tgid_cr2, &tgid, &new_stats, BPF_ANY);
+    }
 
     return 0;
 }
diff --git a/sigsegv-monitor.c b/sigsegv-monitor.c
index d8a7ac9..7d39a53 100644
--- a/sigsegv-monitor.c
+++ b/sigsegv-monitor.c
@@ -93,13 +93,22 @@ void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) {
     printf("\"flags\":\"0x%016llx\",", e->regs.flags);
     printf("\"trapno\":\"0x%016llx\",", e->regs.trapno);
     printf("\"err\":\"0x%016llx\",", e->regs.err);
-    printf("\"cr2\":\"0x%016llx\",", e->regs.cr2);
-    if (e->regs.cr2_fault != (u64)-1)
-        printf("\"cr2_fault\":\"0x%016llx\"", e->regs.cr2_fault);
-    else
-        printf("\"cr2_fault\":null");
+    printf("\"cr2\":\"0x%016llx\"", e->regs.cr2);
     printf("},");
 
+    #ifdef TRACE_PF_CR2
+    printf("\"page_faults\": [");
+    for_each(i, e->pf_count)
+    {
+        printf("{\"cr2\":\"0x%016llx\",\"err\":\"0x%016llx\",\"tai\":%llu}", e->pf[i].cr2, e->pf[i].err, e->pf[i].tai);
+
+        if (i + 1 != e->pf_count) {
+            printf(",");
+        }
+    }
+    printf("],");
+    #endif
+
     printf("\"lbr\":[");
     int lbr_limit = (e->lbr_count < MAX_LBR_ENTRIES) ? e->lbr_count : MAX_LBR_ENTRIES;
     for_each(i, lbr_limit) {
diff --git a/sigsegv-monitor.h b/sigsegv-monitor.h
index 1465aba..1fdceba 100644
--- a/sigsegv-monitor.h
+++ b/sigsegv-monitor.h
@@ -1,7 +1,17 @@
 #pragma once
 
-
 #define MAX_LBR_ENTRIES 32
+#define MAX_USER_PF_ENTRIES 16
+
+// By default is commented: a lot of #PF events are hit
+// so enable only if it is acceptable.
+// #define TRACE_PF_CR2
+
+struct page_fault_info_t {
+    u64 cr2;
+    u64 err;
+    u64 tai;
+};
 
 struct user_regs_t {
     u64 rip;
@@ -25,7 +35,6 @@ struct user_regs_t {
     u64 trapno;
     u64 err;
     u64 cr2;
-    u64 cr2_fault;
 };
 
 // WARNING: this is for the SENDING process (e.g. pid) of the signal!
@@ -45,4 +54,7 @@ struct event_t {
     struct perf_branch_entry lbr[MAX_LBR_ENTRIES];
 
     u64 tai; // time atomic international
+
+    u32 pf_count;
+    struct page_fault_info_t pf[MAX_USER_PF_ENTRIES];
 };