-
Notifications
You must be signed in to change notification settings - Fork 2
Show multiple user PF's in the output #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,10 +4,6 @@ | |
| #include <bpf/bpf_tracing.h> | ||
| #include "sigsegv-monitor.h" | ||
|
|
||
| // By default is commented: a lot of #PF events are hit | ||
| // so enable only if it is acceptable. | ||
| // #define TRACE_PF_CR2 | ||
|
|
||
| // if /sys/kernel/tracing/trace_on is set to 1, | ||
| // cat /sys/kernel/tracing/trace | ||
| // will show the bpf_printk() output | ||
|
|
@@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user { | |
| char __data[0]; | ||
| }; | ||
|
|
||
| struct cr2_stat { | ||
| u64 cr2; | ||
| u64 err; | ||
| u64 tai; | ||
| }; | ||
|
|
||
| struct cr2_stats { | ||
| struct cr2_stat stat[MAX_USER_PF_ENTRIES]; | ||
| u64 head; | ||
| u64 count; | ||
| }; | ||
|
|
||
| struct { | ||
| __uint(type, BPF_MAP_TYPE_HASH); | ||
| __uint(max_entries, 1024); | ||
| __type(key, u32); | ||
| __type(value, u64); | ||
| __type(value, struct cr2_stats); | ||
| } tgid_cr2 SEC(".maps"); | ||
|
|
||
| inline void cr2stats_init(struct cr2_stats* stats) { | ||
| stats->head = 0; | ||
| stats->count = 0; | ||
| } | ||
|
|
||
| inline void cr2stats_push(struct cr2_stats* stats, struct cr2_stat* value) { | ||
| if (stats->head < MAX_USER_PF_ENTRIES) { | ||
| stats->stat[stats->head] = *value; | ||
|
|
||
| if (++stats->head == MAX_USER_PF_ENTRIES) { | ||
| stats->head = 0; | ||
| } | ||
|
|
||
| if (stats->count < MAX_USER_PF_ENTRIES) { | ||
| ++stats->count; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // The `index` parameter here is not an index in the array, but an index in the ring buffer, | ||
| // i.e. passing an index 0 would return the oldest element in the ring buffer. | ||
| inline struct cr2_stat* cr2stats_get(struct cr2_stats* stats, u32 index) { | ||
damiandsap marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (stats->count == MAX_USER_PF_ENTRIES) { | ||
| index += stats->head; | ||
| if (index >= MAX_USER_PF_ENTRIES) { | ||
| index -= MAX_USER_PF_ENTRIES; | ||
| } | ||
| } | ||
|
|
||
| if (index < MAX_USER_PF_ENTRIES) { | ||
| return stats->stat + index; | ||
| } | ||
|
|
||
| return NULL; | ||
| } | ||
| #endif | ||
|
|
||
| // Output map (for user space) | ||
|
|
@@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { | |
| bpf_probe_read_kernel_str(&event->tgleader_comm, sizeof(event->tgleader_comm), &task->group_leader->comm); | ||
| // TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified? | ||
| { | ||
| struct pid const* thread_pid = task->thread_pid; | ||
| unsigned int const level = thread_pid->level; | ||
| // thread_pid->numbers is a size-one flexible array member (type numbers[1]) | ||
| // => cannot perform bounds-check against BTF information | ||
| // => need bpf_probe_read_kernel to read from indices potentially > 1 | ||
| struct upid const* upid_inv = &thread_pid->numbers[level]; | ||
| event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call | ||
| } | ||
| { | ||
| struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID]; | ||
| unsigned int const level = tgid_pid->level; | ||
| struct upid const* tgid_upid_inv = &tgid_pid->numbers[level]; | ||
| // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread? | ||
| // TODO: don't we need RCU here? | ||
| event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr); | ||
| } | ||
|
|
||
| event->regs.trapno = task->thread.trap_nr; // TODO: also copy the other fields like cr2 and error_code | ||
| struct pid const* thread_pid = task->thread_pid; | ||
| unsigned int const level = thread_pid->level; | ||
| // thread_pid->numbers is a size-one flexible array member (type numbers[1]) | ||
| // => cannot perform bounds-check against BTF information | ||
| // => need bpf_probe_read_kernel to read from indices potentially > 1 | ||
| struct upid const* upid_inv = &thread_pid->numbers[level]; | ||
| event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call | ||
| } | ||
| { | ||
| struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID]; | ||
| unsigned int const level = tgid_pid->level; | ||
| struct upid const* tgid_upid_inv = &tgid_pid->numbers[level]; | ||
| // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread? | ||
| // TODO: don't we need RCU here? | ||
| event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr); | ||
| } | ||
|
|
||
| event->regs.trapno = task->thread.trap_nr; | ||
| event->regs.err = task->thread.error_code; | ||
|
|
||
| // TODO: how are these regs acquired? | ||
|
|
@@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { | |
| event->regs.flags = regs->flags; | ||
|
|
||
| event->regs.cr2 = task->thread.cr2; | ||
| event->regs.cr2_fault = -1; | ||
|
|
||
| #ifdef TRACE_PF_CR2 | ||
| u32 tgid = task->tgid; | ||
| u64 *cr2 = bpf_map_lookup_elem(&tgid_cr2, &tgid); | ||
| } | ||
|
|
||
| if (cr2) { | ||
| event->regs.cr2_fault = *cr2; | ||
| bpf_map_delete_elem(&tgid_cr2, &tgid); | ||
| event->pf_count = 0; | ||
| #ifdef TRACE_PF_CR2 | ||
| u32 tgid = task->tgid; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure if we should use the process ID (tgid) or rather the thread ID (pid) for the key of the map... on the one hand, the map can be smaller. On the other hand, we'd need to record which thread has generated the PF, and we might rotate through the ring buffer too quickly. I'm also not sure if we'd need to use some form of locking if multiple threads can write into the ring buffer.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah we'd need locking: https://docs.ebpf.io/linux/helper-function/bpf_map_lookup_elem/ |
||
| struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid); | ||
|
|
||
| if (cr2stats) { | ||
| for (u32 i = 0; i < cr2stats->count && i < MAX_USER_PF_ENTRIES; i++) { | ||
| struct cr2_stat* stat = cr2stats_get(cr2stats, i); | ||
| if (stat) { | ||
| event->pf[i].cr2 = stat->cr2; | ||
| event->pf[i].err = stat->err; | ||
| event->pf[i].tai = stat->tai; | ||
|
|
||
| ++event->pf_count; | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
| bpf_map_delete_elem(&tgid_cr2, &tgid); | ||
| } | ||
| #endif | ||
|
|
||
| // TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel? | ||
| long ret = bpf_get_branch_snapshot(&event->lbr, sizeof(event->lbr), 0); | ||
|
|
@@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) { | |
| #ifdef TRACE_PF_CR2 | ||
| SEC("tracepoint/exceptions/page_fault_user") | ||
| int trace_page_fault(struct trace_event_raw_page_fault_user *ctx) { | ||
| u64 cr2; | ||
| struct cr2_stat stat; | ||
| u32 tgid; | ||
|
|
||
| cr2 = ctx->address; | ||
| stat.cr2 = ctx->address; | ||
| stat.err = ctx->error_code; | ||
| stat.tai = bpf_ktime_get_tai_ns(); | ||
damiandsap marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| tgid = bpf_get_current_pid_tgid() >> 32; | ||
|
|
||
| bpf_map_update_elem(&tgid_cr2, &tgid, &cr2, BPF_ANY); | ||
| struct cr2_stats *cr2stats = bpf_map_lookup_elem(&tgid_cr2, &tgid); | ||
| if (cr2stats) { | ||
| cr2stats_push(cr2stats, &stat); | ||
| } else { | ||
| struct cr2_stats new_stats; | ||
| cr2stats_init(&new_stats); | ||
| cr2stats_push(&new_stats, &stat); | ||
|
|
||
| bpf_map_update_elem(&tgid_cr2, &tgid, &new_stats, BPF_ANY); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,17 @@ | ||
| #pragma once | ||
|
|
||
|
|
||
| #define MAX_LBR_ENTRIES 32 | ||
| #define MAX_USER_PF_ENTRIES 16 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 16 is fine if the ring buffer is for a single thread, but I fear that with dozens of threads, there might be too many PF, so any interesting one might be rotated out by the time we land in |
||
|
|
||
| // By default is commented: a lot of #PF events are hit | ||
| // so enable only if it is acceptable. | ||
| // #define TRACE_PF_CR2 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should enable this, since it doesn't seem to have much of a performance impact. |
||
|
|
||
| struct page_fault_info_t { | ||
| u64 cr2; | ||
| u64 err; | ||
| u64 tai; | ||
| }; | ||
|
|
||
| struct user_regs_t { | ||
| u64 rip; | ||
|
|
@@ -25,7 +35,6 @@ struct user_regs_t { | |
| u64 trapno; | ||
| u64 err; | ||
| u64 cr2; | ||
| u64 cr2_fault; | ||
| }; | ||
|
|
||
| // WARNING: this is for the SENDING process (e.g. pid) of the signal! | ||
|
|
@@ -45,4 +54,7 @@ struct event_t { | |
| struct perf_branch_entry lbr[MAX_LBR_ENTRIES]; | ||
|
|
||
| u64 tai; // time atomic international | ||
|
|
||
| u32 pf_count; | ||
| struct page_fault_info_t pf[MAX_USER_PF_ENTRIES]; | ||
| }; | ||
Uh oh!
There was an error while loading. Please reload this page.