From f6edaa1c25f5739b02236277b7eea8901d339261 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 31 May 2026 17:59:55 -0700 Subject: [PATCH 1/4] core: move syscall name mapping out of context Signed-off-by: Cong Wang --- crates/sandlock-core/src/context.rs | 122 +----------------- crates/sandlock-core/src/sandbox.rs | 2 +- crates/sandlock-core/src/seccomp/mod.rs | 1 + .../src/seccomp/syscall_names.rs | 119 +++++++++++++++++ crates/sandlock-ffi/src/lib.rs | 2 +- 5 files changed, 123 insertions(+), 123 deletions(-) create mode 100644 crates/sandlock-core/src/seccomp/syscall_names.rs diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index 60b8a4a..6ce8888 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -119,127 +119,7 @@ pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result { Ok(u32::from_le_bytes(buf)) } -// ============================================================ -// Syscall name → number mapping -// ============================================================ - -/// Map a syscall name to its `libc::SYS_*` number. -/// -/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for -/// notif and arg-filter lists. -pub fn syscall_name_to_nr(name: &str) -> Option { - let nr: i64 = match name { - "mount" => libc::SYS_mount, - "umount2" => libc::SYS_umount2, - "pivot_root" => libc::SYS_pivot_root, - "swapon" => libc::SYS_swapon, - "swapoff" => libc::SYS_swapoff, - "reboot" => libc::SYS_reboot, - "sethostname" => libc::SYS_sethostname, - "setdomainname" => libc::SYS_setdomainname, - "kexec_load" => libc::SYS_kexec_load, - "init_module" => libc::SYS_init_module, - "finit_module" => libc::SYS_finit_module, - "delete_module" => libc::SYS_delete_module, - "unshare" => libc::SYS_unshare, - "setns" => libc::SYS_setns, - "perf_event_open" => libc::SYS_perf_event_open, - "bpf" => libc::SYS_bpf, - "userfaultfd" => libc::SYS_userfaultfd, - "keyctl" => libc::SYS_keyctl, - "add_key" => libc::SYS_add_key, - "request_key" => libc::SYS_request_key, - "ptrace" => libc::SYS_ptrace, - "process_vm_readv" => libc::SYS_process_vm_readv, - "process_vm_writev" => libc::SYS_process_vm_writev, - "open_by_handle_at" => libc::SYS_open_by_handle_at, - "name_to_handle_at" => libc::SYS_name_to_handle_at, - "ioperm" => arch::SYS_IOPERM?, - "iopl" => arch::SYS_IOPL?, - "quotactl" => libc::SYS_quotactl, - "acct" => libc::SYS_acct, - "lookup_dcookie" => libc::SYS_lookup_dcookie, - // nfsservctl was removed in Linux 3.1; no libc constant — skip - "personality" => libc::SYS_personality, - "io_uring_setup" => libc::SYS_io_uring_setup, - "io_uring_enter" => libc::SYS_io_uring_enter, - "io_uring_register" => libc::SYS_io_uring_register, - // Additional syscalls for notif/arg filters - "clone" => libc::SYS_clone, - "clone3" => libc::SYS_clone3, - "vfork" => arch::SYS_VFORK?, - "mmap" => libc::SYS_mmap, - "munmap" => libc::SYS_munmap, - "brk" => libc::SYS_brk, - "mremap" => libc::SYS_mremap, - "connect" => libc::SYS_connect, - "sendto" => libc::SYS_sendto, - "sendmsg" => libc::SYS_sendmsg, - "sendmmsg" => libc::SYS_sendmmsg, - "ioctl" => libc::SYS_ioctl, - "socket" => libc::SYS_socket, - "prctl" => libc::SYS_prctl, - "getrandom" => libc::SYS_getrandom, - "openat" => libc::SYS_openat, - "open" => arch::SYS_OPEN?, - "getdents64" => libc::SYS_getdents64, - "getdents" => arch::SYS_GETDENTS?, - "bind" => libc::SYS_bind, - "getsockname" => libc::SYS_getsockname, - "clock_gettime" => libc::SYS_clock_gettime, - "gettimeofday" => libc::SYS_gettimeofday, - "time" => arch::SYS_TIME?, - "clock_nanosleep" => libc::SYS_clock_nanosleep, - "timerfd_settime" => libc::SYS_timerfd_settime, - "timer_settime" => libc::SYS_timer_settime, - "execve" => libc::SYS_execve, - "execveat" => libc::SYS_execveat, - // COW filesystem syscalls - "unlinkat" => libc::SYS_unlinkat, - "mkdirat" => libc::SYS_mkdirat, - "renameat2" => libc::SYS_renameat2, - "newfstatat" => libc::SYS_newfstatat, - "statx" => libc::SYS_statx, - "faccessat" => libc::SYS_faccessat, - "symlinkat" => libc::SYS_symlinkat, - "linkat" => libc::SYS_linkat, - "fchmodat" => libc::SYS_fchmodat, - "fchownat" => libc::SYS_fchownat, - "readlinkat" => libc::SYS_readlinkat, - "truncate" => libc::SYS_truncate, - "utimensat" => libc::SYS_utimensat, - "unlink" => arch::SYS_UNLINK?, - "rmdir" => arch::SYS_RMDIR?, - "mkdir" => arch::SYS_MKDIR?, - "rename" => arch::SYS_RENAME?, - "stat" => arch::SYS_STAT?, - "lstat" => arch::SYS_LSTAT?, - "access" => arch::SYS_ACCESS?, - "symlink" => arch::SYS_SYMLINK?, - "link" => arch::SYS_LINK?, - "chmod" => arch::SYS_CHMOD?, - "chown" => arch::SYS_CHOWN?, - "lchown" => arch::SYS_LCHOWN?, - "readlink" => arch::SYS_READLINK?, - "futimesat" => arch::SYS_FUTIMESAT?, - "fork" => arch::SYS_FORK?, - // SysV IPC (gated by extra_allow_syscalls=["sysv_ipc"]; denied by default) - "shmget" => libc::SYS_shmget, - "shmat" => libc::SYS_shmat, - "shmdt" => libc::SYS_shmdt, - "shmctl" => libc::SYS_shmctl, - "msgget" => libc::SYS_msgget, - "msgsnd" => libc::SYS_msgsnd, - "msgrcv" => libc::SYS_msgrcv, - "msgctl" => libc::SYS_msgctl, - "semget" => libc::SYS_semget, - "semop" => libc::SYS_semop, - "semctl" => libc::SYS_semctl, - "semtimedop" => libc::SYS_semtimedop, - _ => return None, - }; - Some(nr as u32) -} +use crate::seccomp::syscall_names::syscall_name_to_nr; // ============================================================ // Sandbox → syscall lists diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index a280bad..558d390 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -1674,7 +1674,7 @@ fn validate_syscall_names(names: &[String]) -> Result<(), SandboxError> { let unknown: Vec<&str> = names .iter() .map(String::as_str) - .filter(|name| crate::context::syscall_name_to_nr(name).is_none()) + .filter(|name| crate::seccomp::syscall_names::syscall_name_to_nr(name).is_none()) .collect(); if unknown.is_empty() { Ok(()) diff --git a/crates/sandlock-core/src/seccomp/mod.rs b/crates/sandlock-core/src/seccomp/mod.rs index 231fc86..b92cc5e 100644 --- a/crates/sandlock-core/src/seccomp/mod.rs +++ b/crates/sandlock-core/src/seccomp/mod.rs @@ -4,3 +4,4 @@ pub mod dispatch; pub mod notif; pub(crate) mod state; pub mod syscall; +pub mod syscall_names; diff --git a/crates/sandlock-core/src/seccomp/syscall_names.rs b/crates/sandlock-core/src/seccomp/syscall_names.rs new file mode 100644 index 0000000..d6cfc35 --- /dev/null +++ b/crates/sandlock-core/src/seccomp/syscall_names.rs @@ -0,0 +1,119 @@ +use crate::arch; + +/// Map a syscall name to its `libc::SYS_*` number. +/// +/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for +/// notif and arg-filter lists. +pub fn syscall_name_to_nr(name: &str) -> Option { + let nr: i64 = match name { + "mount" => libc::SYS_mount, + "umount2" => libc::SYS_umount2, + "pivot_root" => libc::SYS_pivot_root, + "swapon" => libc::SYS_swapon, + "swapoff" => libc::SYS_swapoff, + "reboot" => libc::SYS_reboot, + "sethostname" => libc::SYS_sethostname, + "setdomainname" => libc::SYS_setdomainname, + "kexec_load" => libc::SYS_kexec_load, + "init_module" => libc::SYS_init_module, + "finit_module" => libc::SYS_finit_module, + "delete_module" => libc::SYS_delete_module, + "unshare" => libc::SYS_unshare, + "setns" => libc::SYS_setns, + "perf_event_open" => libc::SYS_perf_event_open, + "bpf" => libc::SYS_bpf, + "userfaultfd" => libc::SYS_userfaultfd, + "keyctl" => libc::SYS_keyctl, + "add_key" => libc::SYS_add_key, + "request_key" => libc::SYS_request_key, + "ptrace" => libc::SYS_ptrace, + "process_vm_readv" => libc::SYS_process_vm_readv, + "process_vm_writev" => libc::SYS_process_vm_writev, + "open_by_handle_at" => libc::SYS_open_by_handle_at, + "name_to_handle_at" => libc::SYS_name_to_handle_at, + "ioperm" => arch::SYS_IOPERM?, + "iopl" => arch::SYS_IOPL?, + "quotactl" => libc::SYS_quotactl, + "acct" => libc::SYS_acct, + "lookup_dcookie" => libc::SYS_lookup_dcookie, + // nfsservctl was removed in Linux 3.1; no libc constant -- skip. + "personality" => libc::SYS_personality, + "io_uring_setup" => libc::SYS_io_uring_setup, + "io_uring_enter" => libc::SYS_io_uring_enter, + "io_uring_register" => libc::SYS_io_uring_register, + // Additional syscalls for notif/arg filters. + "clone" => libc::SYS_clone, + "clone3" => libc::SYS_clone3, + "vfork" => arch::SYS_VFORK?, + "mmap" => libc::SYS_mmap, + "munmap" => libc::SYS_munmap, + "brk" => libc::SYS_brk, + "mremap" => libc::SYS_mremap, + "connect" => libc::SYS_connect, + "sendto" => libc::SYS_sendto, + "sendmsg" => libc::SYS_sendmsg, + "sendmmsg" => libc::SYS_sendmmsg, + "ioctl" => libc::SYS_ioctl, + "socket" => libc::SYS_socket, + "prctl" => libc::SYS_prctl, + "getrandom" => libc::SYS_getrandom, + "openat" => libc::SYS_openat, + "open" => arch::SYS_OPEN?, + "getdents64" => libc::SYS_getdents64, + "getdents" => arch::SYS_GETDENTS?, + "bind" => libc::SYS_bind, + "getsockname" => libc::SYS_getsockname, + "clock_gettime" => libc::SYS_clock_gettime, + "gettimeofday" => libc::SYS_gettimeofday, + "time" => arch::SYS_TIME?, + "clock_nanosleep" => libc::SYS_clock_nanosleep, + "timerfd_settime" => libc::SYS_timerfd_settime, + "timer_settime" => libc::SYS_timer_settime, + "execve" => libc::SYS_execve, + "execveat" => libc::SYS_execveat, + // COW filesystem syscalls. + "unlinkat" => libc::SYS_unlinkat, + "mkdirat" => libc::SYS_mkdirat, + "renameat2" => libc::SYS_renameat2, + "newfstatat" => libc::SYS_newfstatat, + "statx" => libc::SYS_statx, + "faccessat" => libc::SYS_faccessat, + "symlinkat" => libc::SYS_symlinkat, + "linkat" => libc::SYS_linkat, + "fchmodat" => libc::SYS_fchmodat, + "fchownat" => libc::SYS_fchownat, + "readlinkat" => libc::SYS_readlinkat, + "truncate" => libc::SYS_truncate, + "utimensat" => libc::SYS_utimensat, + "unlink" => arch::SYS_UNLINK?, + "rmdir" => arch::SYS_RMDIR?, + "mkdir" => arch::SYS_MKDIR?, + "rename" => arch::SYS_RENAME?, + "stat" => arch::SYS_STAT?, + "lstat" => arch::SYS_LSTAT?, + "access" => arch::SYS_ACCESS?, + "symlink" => arch::SYS_SYMLINK?, + "link" => arch::SYS_LINK?, + "chmod" => arch::SYS_CHMOD?, + "chown" => arch::SYS_CHOWN?, + "lchown" => arch::SYS_LCHOWN?, + "readlink" => arch::SYS_READLINK?, + "futimesat" => arch::SYS_FUTIMESAT?, + "fork" => arch::SYS_FORK?, + // SysV IPC (gated by extra_allow_syscalls=["sysv_ipc"]; denied by default). + "shmget" => libc::SYS_shmget, + "shmat" => libc::SYS_shmat, + "shmdt" => libc::SYS_shmdt, + "shmctl" => libc::SYS_shmctl, + "msgget" => libc::SYS_msgget, + "msgsnd" => libc::SYS_msgsnd, + "msgrcv" => libc::SYS_msgrcv, + "msgctl" => libc::SYS_msgctl, + "semget" => libc::SYS_semget, + "semop" => libc::SYS_semop, + "semctl" => libc::SYS_semctl, + "semtimedop" => libc::SYS_semtimedop, + _ => return None, + }; + Some(nr as u32) +} diff --git a/crates/sandlock-ffi/src/lib.rs b/crates/sandlock-ffi/src/lib.rs index d2a350b..003958a 100644 --- a/crates/sandlock-ffi/src/lib.rs +++ b/crates/sandlock-ffi/src/lib.rs @@ -533,7 +533,7 @@ pub unsafe extern "C" fn sandlock_syscall_nr(name: *const c_char) -> i64 { Ok(s) => s, Err(_) => return -1, }; - match sandlock_core::context::syscall_name_to_nr(name) { + match sandlock_core::seccomp::syscall_names::syscall_name_to_nr(name) { Some(nr) => i64::from(nr), None => -1, } From f2e7cb7693cd3e888516bb9965a48d4fb1e1120c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 31 May 2026 18:28:28 -0700 Subject: [PATCH 2/4] seccomp: guard against notify/deny list overlap Signed-off-by: Cong Wang --- crates/sandlock-core/src/arch.rs | 7 +++ crates/sandlock-core/src/chroot/dispatch.rs | 5 +-- crates/sandlock-core/src/context.rs | 10 ++--- crates/sandlock-core/src/cow/dispatch.rs | 5 +-- crates/sandlock-core/src/seccomp/bpf.rs | 46 +++++++++++++++++--- crates/sandlock-core/src/seccomp/dispatch.rs | 4 +- 6 files changed, 55 insertions(+), 22 deletions(-) diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index c2de84c..7d599e8 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -1,5 +1,12 @@ //! Architecture-specific syscall and seccomp helpers. +/// `faccessat2(2)` syscall number on Sandlock's supported Linux architectures. +/// +/// The `libc` crate does not expose this constant on all supported build +/// targets yet, but the seccomp filters and path virtualization handlers need +/// to intercept it because glibc 2.33+ may prefer it over `faccessat`. +pub const SYS_FACCESSAT2: i64 = 439; + #[cfg(target_arch = "x86_64")] mod imp { pub const AUDIT_ARCH: u32 = 0xC000_003E; diff --git a/crates/sandlock-core/src/chroot/dispatch.rs b/crates/sandlock-core/src/chroot/dispatch.rs index 035301b..6592dbe 100644 --- a/crates/sandlock-core/src/chroot/dispatch.rs +++ b/crates/sandlock-core/src/chroot/dispatch.rs @@ -334,9 +334,6 @@ fn exec_on_host(f: impl FnOnce(*const libc::c_char) -> libc::c_int, host: &Path) } } -/// SYS_faccessat2 syscall number (439 on both x86_64 and aarch64). -pub(crate) const SYS_FACCESSAT2: i64 = 439; - // ============================================================ // openat handler // ============================================================ @@ -1072,7 +1069,7 @@ pub(crate) async fn handle_chroot_stat( Err(a) => return a, }; - if nr == libc::SYS_faccessat || nr == SYS_FACCESSAT2 { + if nr == libc::SYS_faccessat || nr == crate::arch::SYS_FACCESSAT2 { return if real_path.exists() || real_path.is_symlink() { NotifAction::ReturnValue(0) } else { diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index 6ce8888..a0d1517 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -244,7 +244,7 @@ pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec libc::SYS_newfstatat as u32, libc::SYS_statx as u32, libc::SYS_faccessat as u32, - 439u32, // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat + arch::SYS_FACCESSAT2 as u32, libc::SYS_readlinkat as u32, libc::SYS_getdents64 as u32, libc::SYS_chdir as u32, @@ -277,7 +277,7 @@ pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec libc::SYS_newfstatat as u32, libc::SYS_statx as u32, libc::SYS_faccessat as u32, - 439u32, // SYS_faccessat2 — glibc 2.33+ uses this instead of faccessat + arch::SYS_FACCESSAT2 as u32, libc::SYS_readlinkat as u32, libc::SYS_getdents64 as u32, libc::SYS_chdir as u32, @@ -1071,8 +1071,6 @@ mod tests { /// chroot and COW modes — glibc 2.33+ uses it instead of faccessat. #[test] fn test_notif_syscalls_faccessat2() { - const SYS_FACCESSAT2: u32 = 439; - // Chroot mode let policy = Sandbox::builder() .chroot("/tmp") @@ -1080,7 +1078,7 @@ mod tests { .unwrap(); let nrs = notif_syscalls(&policy, None); assert!(nrs.contains(&(libc::SYS_faccessat as u32))); - assert!(nrs.contains(&SYS_FACCESSAT2), + assert!(nrs.contains(&(arch::SYS_FACCESSAT2 as u32)), "chroot notif filter must include SYS_faccessat2 (439)"); // COW mode @@ -1090,7 +1088,7 @@ mod tests { .unwrap(); let nrs = notif_syscalls(&policy, None); assert!(nrs.contains(&(libc::SYS_faccessat as u32))); - assert!(nrs.contains(&SYS_FACCESSAT2), + assert!(nrs.contains(&(arch::SYS_FACCESSAT2 as u32)), "COW notif filter must include SYS_faccessat2 (439)"); } diff --git a/crates/sandlock-core/src/cow/dispatch.rs b/crates/sandlock-core/src/cow/dispatch.rs index f3ba10f..1b9004b 100644 --- a/crates/sandlock-core/src/cow/dispatch.rs +++ b/crates/sandlock-core/src/cow/dispatch.rs @@ -590,9 +590,6 @@ pub(crate) async fn handle_cow_write( // access() handler — fake W_OK for COW-managed paths // ============================================================ -/// SYS_faccessat2 syscall number on x86_64 (439). Not always in libc crate. -pub(crate) const SYS_FACCESSAT2: i64 = 439; - /// Handle faccessat/faccessat2/access — return success for W_OK checks on /// COW-managed paths so programs that pre-check write permissions (like dpkg) /// don't fail before the COW layer can redirect their writes. @@ -768,7 +765,7 @@ pub(crate) async fn handle_cow_stat( }; drop(st); - if nr == libc::SYS_faccessat || nr == SYS_FACCESSAT2 { + if nr == libc::SYS_faccessat || nr == crate::arch::SYS_FACCESSAT2 { // For faccessat, just check if the file exists (we already resolved it) if real_path.exists() || real_path.is_symlink() { return NotifAction::ReturnValue(0); diff --git a/crates/sandlock-core/src/seccomp/bpf.rs b/crates/sandlock-core/src/seccomp/bpf.rs index 714058b..8681b36 100644 --- a/crates/sandlock-core/src/seccomp/bpf.rs +++ b/crates/sandlock-core/src/seccomp/bpf.rs @@ -48,17 +48,32 @@ pub(crate) fn jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter { /// * `block_syscalls` — syscalls that return ERRNO(EPERM) /// * `arg_block` — pre-built arg filter instructions (from `context::arg_filters`) /// -/// Returns an error if the resulting program would exceed the kernel's -/// `BPF_MAXINSNS` (4096) instruction limit. Catching this here gives a -/// clearer error than the kernel's `EINVAL` from `seccomp(2)`, and also -/// guards the `(idx - n) as u8` jump-offset arithmetic below — cBPF jump -/// offsets are u8, so a program over 256 instructions plus careless -/// changes could silently truncate offsets. +/// Returns an error if a syscall appears in both notification and deny lists, +/// or if the resulting program would exceed the kernel's `BPF_MAXINSNS` +/// (4096) instruction limit. Catching the size limit here gives a clearer +/// error than the kernel's `EINVAL` from `seccomp(2)`, and also guards the +/// `(idx - n) as u8` jump-offset arithmetic below — cBPF jump offsets are u8, +/// so a program over 256 instructions plus careless changes could silently +/// truncate offsets. pub fn assemble_filter( notif_syscalls: &[u32], block_syscalls: &[u32], arg_block: &[SockFilter], ) -> Result, std::io::Error> { + if let Some(&nr) = notif_syscalls + .iter() + .find(|&&nr| block_syscalls.contains(&nr)) + { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "syscall {} appears in both notification and deny lists; \ + notification rules are evaluated first", + nr + ), + )); + } + // ---- compute final layout sizes ---- let arch_block = 2usize; // LD arch, JEQ arch (KILL is in ret section) let arg_block_len = arg_block.len(); @@ -226,6 +241,25 @@ mod tests { assert!(has_openat); } + #[test] + fn test_rejects_notif_deny_overlap() { + let err = match assemble_filter( + &[libc::SYS_openat as u32], + &[libc::SYS_openat as u32], + &[], + ) { + Ok(_) => panic!("expected notif/deny overlap to be rejected"), + Err(err) => err, + }; + + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + assert!( + err.to_string().contains("both notification and deny lists"), + "unexpected error: {}", + err + ); + } + #[test] fn test_arch_jf_lands_on_kill() { let prog = assemble_filter(&[], &[], &[]).unwrap(); diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index e87d792..9c96d1c 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -849,7 +849,7 @@ fn register_chroot_handlers( for &nr in &[ libc::SYS_newfstatat, libc::SYS_faccessat, - crate::chroot::dispatch::SYS_FACCESSAT2, + arch::SYS_FACCESSAT2, ] { table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_stat)); @@ -942,7 +942,7 @@ fn register_cow_handlers(table: &mut DispatchTable, ctx: &Arc) { table.register(libc::SYS_utimensat, cow_call!(crate::cow::dispatch::handle_cow_utimensat)); - let mut access_nrs = vec![libc::SYS_faccessat, crate::cow::dispatch::SYS_FACCESSAT2]; + let mut access_nrs = vec![libc::SYS_faccessat, arch::SYS_FACCESSAT2]; access_nrs.extend(arch::SYS_ACCESS); for nr in access_nrs { table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_access)); From 80e73157d7560ca18f78596a1c677e4afe63d6fb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 31 May 2026 18:37:21 -0700 Subject: [PATCH 3/4] core: refactor notification syscall list builder --- crates/sandlock-core/src/context.rs | 405 +++++++++++++++++----------- 1 file changed, 251 insertions(+), 154 deletions(-) diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index a0d1517..c4ba86a 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -125,15 +125,237 @@ use crate::seccomp::syscall_names::syscall_name_to_nr; // Sandbox → syscall lists // ============================================================ +#[derive(Default)] +struct SyscallList { + nrs: Vec, +} + +impl SyscallList { + fn with(syscalls: &[i64]) -> Self { + let mut list = Self::default(); + list.extend(syscalls); + list + } + + fn push(&mut self, nr: i64) { + self.nrs.push(nr as u32); + } + + fn extend(&mut self, syscalls: &[i64]) { + self.nrs.extend(syscalls.iter().map(|&nr| nr as u32)); + } + + fn push_optional(&mut self, nr: Option) { + if let Some(nr) = nr { + self.push(nr); + } + } + + fn extend_optional(&mut self, syscalls: &[Option]) { + for &nr in syscalls { + self.push_optional(nr); + } + } + + fn finish(mut self) -> Vec { + self.nrs.sort_unstable(); + self.nrs.dedup(); + self.nrs + } +} + +const BASE_NOTIF_SYSCALLS: &[i64] = &[ + libc::SYS_clone, + libc::SYS_clone3, + libc::SYS_wait4, + libc::SYS_waitid, +]; + +const MEMORY_NOTIF_SYSCALLS: &[i64] = &[ + libc::SYS_mmap, + libc::SYS_munmap, + libc::SYS_brk, + libc::SYS_mremap, +]; + +const NETWORK_POLICY_SYSCALLS: &[i64] = &[ + libc::SYS_connect, + libc::SYS_sendto, + libc::SYS_sendmsg, + libc::SYS_sendmmsg, + libc::SYS_bind, +]; + +// Also intercept openat so the supervisor can re-patch vDSO after exec. +const RANDOM_NOTIF_SYSCALLS: &[i64] = &[libc::SYS_getrandom, libc::SYS_openat]; + +// Also intercept openat so the supervisor gets a notification after exec +// and can re-patch the vDSO (exec replaces vDSO with a fresh copy). +const TIME_NOTIF_SYSCALLS: &[i64] = &[ + libc::SYS_clock_nanosleep, + libc::SYS_timerfd_settime, + libc::SYS_timer_settime, + libc::SYS_openat, +]; + +// /proc virtualization + /etc/hosts virtualization (always on). +// +// `openat` carries the simple `(AT_FDCWD, "/proc/...")` and +// `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape +// on newer libcs; legacy `open(path, ...)` is the same path without a +// dirfd. The handlers normalize all three into a single absolute path +// check, so we have to put every variant on the notif list -- otherwise +// a caller that picks `open` or `openat2` slips past virtualization +// and reads the real on-disk file. +const PROCFS_HOSTS_NOTIF_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + arch::SYS_OPENAT2, + libc::SYS_getdents64, +]; +const PROCFS_HOSTS_OPTIONAL_SYSCALLS: &[Option] = &[ + arch::SYS_OPEN, + arch::SYS_GETDENTS, +]; + +// Netlink virtualization (always on): +// socket, bind, getsockname -- swap in a unix socketpair for AF_NETLINK +// recvfrom, recvmsg -- zero msg_name so glibc accepts the reply +// (kernel only writes sun_family on unix +// recvmsg, leaving nl_pid uninitialized) +// close -- unregister (pid, fd) so reuse doesn't +// collide with the cookie set +// Send traffic flows through the real socketpair untouched. +const NETLINK_NOTIF_SYSCALLS: &[i64] = &[ + libc::SYS_socket, + libc::SYS_bind, + libc::SYS_getsockname, + libc::SYS_recvfrom, + libc::SYS_recvmsg, + libc::SYS_close, +]; + +const COW_PATH_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_execve, + libc::SYS_execveat, + libc::SYS_unlinkat, + libc::SYS_mkdirat, + libc::SYS_renameat2, + libc::SYS_symlinkat, + libc::SYS_linkat, + libc::SYS_fchmodat, + libc::SYS_fchownat, + libc::SYS_truncate, + libc::SYS_utimensat, + libc::SYS_newfstatat, + libc::SYS_statx, + libc::SYS_faccessat, + arch::SYS_FACCESSAT2, + libc::SYS_readlinkat, + libc::SYS_getdents64, + libc::SYS_chdir, + libc::SYS_getcwd, +]; +const COW_LEGACY_PATH_SYSCALLS: &[Option] = &[ + arch::SYS_OPEN, + arch::SYS_UNLINK, + arch::SYS_RMDIR, + arch::SYS_MKDIR, + arch::SYS_RENAME, + arch::SYS_SYMLINK, + arch::SYS_LINK, + arch::SYS_CHMOD, + arch::SYS_CHOWN, + arch::SYS_LCHOWN, + arch::SYS_STAT, + arch::SYS_LSTAT, + arch::SYS_ACCESS, + arch::SYS_READLINK, + arch::SYS_GETDENTS, +]; + +const CHROOT_PATH_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_execve, + libc::SYS_execveat, + libc::SYS_unlinkat, + libc::SYS_mkdirat, + libc::SYS_renameat2, + libc::SYS_symlinkat, + libc::SYS_linkat, + libc::SYS_fchmodat, + libc::SYS_fchownat, + libc::SYS_truncate, + libc::SYS_newfstatat, + libc::SYS_statx, + libc::SYS_faccessat, + arch::SYS_FACCESSAT2, + libc::SYS_readlinkat, + libc::SYS_getdents64, + libc::SYS_chdir, + libc::SYS_getcwd, + libc::SYS_statfs, + libc::SYS_utimensat, +]; +const CHROOT_LEGACY_PATH_SYSCALLS: &[Option] = &[ + arch::SYS_OPEN, + arch::SYS_STAT, + arch::SYS_LSTAT, + arch::SYS_ACCESS, + arch::SYS_READLINK, + arch::SYS_GETDENTS, + arch::SYS_UNLINK, + arch::SYS_RMDIR, + arch::SYS_MKDIR, + arch::SYS_RENAME, + arch::SYS_SYMLINK, + arch::SYS_LINK, + arch::SYS_CHMOD, + arch::SYS_CHOWN, + arch::SYS_LCHOWN, +]; + +const FS_DENIED_PATH_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_execve, + libc::SYS_execveat, + libc::SYS_linkat, + libc::SYS_renameat2, + libc::SYS_symlinkat, +]; +const FS_DENIED_LEGACY_PATH_SYSCALLS: &[Option] = &[ + arch::SYS_OPEN, + arch::SYS_LINK, + arch::SYS_RENAME, + arch::SYS_SYMLINK, +]; + +const POLICY_EVENT_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_connect, + libc::SYS_sendto, + libc::SYS_bind, + libc::SYS_execve, + libc::SYS_execveat, +]; + +const PORT_REMAP_SYSCALLS: &[i64] = &[ + libc::SYS_bind, + libc::SYS_getsockname, +]; + +fn needs_network_supervision(policy: &Sandbox) -> bool { + !policy.net_allow.is_empty() + || policy.policy_fn.is_some() + || !policy.http_allow.is_empty() + || !policy.http_deny.is_empty() +} + /// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`. pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec { - let mut nrs = vec![ - libc::SYS_clone as u32, - libc::SYS_clone3 as u32, - libc::SYS_wait4 as u32, - libc::SYS_waitid as u32, - ]; - arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK); + let mut nrs = SyscallList::with(BASE_NOTIF_SYSCALLS); + nrs.push_optional(arch::SYS_VFORK); + // Bare fork(2) carries none of the namespace/process-limit risk of // clone/clone3 and was historically left out of the BPF filter so // hot fork-loops (COW map-reduce) bypass the supervisor entirely. @@ -141,199 +363,74 @@ pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec // supervisor can register the new child via ptrace fork events // before it can run user code (argv-safety invariant). if policy.policy_fn.is_some() { - arch::push_optional_syscall(&mut nrs, arch::SYS_FORK); + nrs.push_optional(arch::SYS_FORK); } if policy.max_memory.is_some() { - nrs.push(libc::SYS_mmap as u32); - nrs.push(libc::SYS_munmap as u32); - nrs.push(libc::SYS_brk as u32); - nrs.push(libc::SYS_mremap as u32); + nrs.extend(MEMORY_NOTIF_SYSCALLS); // shmget is in notif only when SysV IPC is allowed. The BPF // layout puts notif JEQs before deny JEQs, so a syscall on // both lists would notify (RET_USER_NOTIF) and silently // bypass the kernel-level deny. When extra_allow_syscalls does not contain "sysv_ipc", // shmget belongs only on the blocklist. if policy.allows_sysv_ipc() { - nrs.push(libc::SYS_shmget as u32); + nrs.push(libc::SYS_shmget); } } - if !policy.net_allow.is_empty() - || policy.policy_fn.is_some() - || !policy.http_allow.is_empty() - || !policy.http_deny.is_empty() - { - nrs.push(libc::SYS_connect as u32); - nrs.push(libc::SYS_sendto as u32); - nrs.push(libc::SYS_sendmsg as u32); - nrs.push(libc::SYS_sendmmsg as u32); - nrs.push(libc::SYS_bind as u32); + if needs_network_supervision(policy) { + nrs.extend(NETWORK_POLICY_SYSCALLS); } if policy.random_seed.is_some() { - nrs.push(libc::SYS_getrandom as u32); - // Also intercept openat so the supervisor can re-patch vDSO after exec. - nrs.push(libc::SYS_openat as u32); + nrs.extend(RANDOM_NOTIF_SYSCALLS); } if policy.time_start.is_some() { - nrs.extend_from_slice(&[ - libc::SYS_clock_nanosleep as u32, - libc::SYS_timerfd_settime as u32, - libc::SYS_timer_settime as u32, - ]); - // Also intercept openat so the supervisor gets a notification after exec - // and can re-patch the vDSO (exec replaces vDSO with a fresh copy). - nrs.push(libc::SYS_openat as u32); + nrs.extend(TIME_NOTIF_SYSCALLS); } - // /proc virtualization + /etc/hosts virtualization (always on). - // - // `openat` carries the simple `(AT_FDCWD, "/proc/...")` and - // `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape - // on newer libcs; legacy `open(path, ...)` is the same path without a - // dirfd. The handlers normalize all three into a single absolute path - // check, so we have to put every variant on the notif list — otherwise - // a caller that picks `open` or `openat2` slips past virtualization - // and reads the real on-disk file. - nrs.push(libc::SYS_openat as u32); - nrs.push(arch::SYS_OPENAT2 as u32); - arch::push_optional_syscall(&mut nrs, arch::SYS_OPEN); - nrs.push(libc::SYS_getdents64 as u32); - arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS); - - // Netlink virtualization (always on): - // socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK - // recvfrom, recvmsg — zero msg_name so glibc accepts the reply - // (kernel only writes sun_family on unix - // recvmsg, leaving nl_pid uninitialized) - // close — unregister (pid, fd) so reuse doesn't - // collide with the cookie set - // Send traffic flows through the real socketpair untouched. - nrs.push(libc::SYS_socket as u32); - nrs.push(libc::SYS_bind as u32); - nrs.push(libc::SYS_getsockname as u32); - nrs.push(libc::SYS_recvfrom as u32); - nrs.push(libc::SYS_recvmsg as u32); - nrs.push(libc::SYS_close as u32); + nrs.extend(PROCFS_HOSTS_NOTIF_SYSCALLS); + nrs.extend_optional(PROCFS_HOSTS_OPTIONAL_SYSCALLS); + nrs.extend(NETLINK_NOTIF_SYSCALLS); + // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo if policy.num_cpus.is_some() { - nrs.push(libc::SYS_sched_getaffinity as u32); + nrs.push(libc::SYS_sched_getaffinity); } if sandbox_name.is_some() { - nrs.push(libc::SYS_uname as u32); - nrs.push(libc::SYS_openat as u32); + nrs.extend(&[libc::SYS_uname, libc::SYS_openat]); } // COW filesystem interception (seccomp-based, unprivileged) if policy.workdir.is_some() { - nrs.extend_from_slice(&[ - libc::SYS_openat as u32, - libc::SYS_execve as u32, - libc::SYS_execveat as u32, - libc::SYS_unlinkat as u32, - libc::SYS_mkdirat as u32, - libc::SYS_renameat2 as u32, - libc::SYS_symlinkat as u32, - libc::SYS_linkat as u32, - libc::SYS_fchmodat as u32, - libc::SYS_fchownat as u32, - libc::SYS_truncate as u32, - libc::SYS_utimensat as u32, - libc::SYS_newfstatat as u32, - libc::SYS_statx as u32, - libc::SYS_faccessat as u32, - arch::SYS_FACCESSAT2 as u32, - libc::SYS_readlinkat as u32, - libc::SYS_getdents64 as u32, - libc::SYS_chdir as u32, - libc::SYS_getcwd as u32, - ]); - for nr in [ - arch::SYS_OPEN, arch::SYS_UNLINK, arch::SYS_RMDIR, arch::SYS_MKDIR, - arch::SYS_RENAME, arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD, - arch::SYS_CHOWN, arch::SYS_LCHOWN, arch::SYS_STAT, arch::SYS_LSTAT, - arch::SYS_ACCESS, arch::SYS_READLINK, arch::SYS_GETDENTS, - ] { - arch::push_optional_syscall(&mut nrs, nr); - } + nrs.extend(COW_PATH_SYSCALLS); + nrs.extend_optional(COW_LEGACY_PATH_SYSCALLS); } // Chroot path interception if policy.chroot.is_some() { - nrs.extend_from_slice(&[ - libc::SYS_openat as u32, - libc::SYS_execve as u32, - libc::SYS_execveat as u32, - libc::SYS_unlinkat as u32, - libc::SYS_mkdirat as u32, - libc::SYS_renameat2 as u32, - libc::SYS_symlinkat as u32, - libc::SYS_linkat as u32, - libc::SYS_fchmodat as u32, - libc::SYS_fchownat as u32, - libc::SYS_truncate as u32, - libc::SYS_newfstatat as u32, - libc::SYS_statx as u32, - libc::SYS_faccessat as u32, - arch::SYS_FACCESSAT2 as u32, - libc::SYS_readlinkat as u32, - libc::SYS_getdents64 as u32, - libc::SYS_chdir as u32, - libc::SYS_getcwd as u32, - libc::SYS_statfs as u32, - libc::SYS_utimensat as u32, - ]); - for nr in [ - arch::SYS_OPEN, arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS, - arch::SYS_READLINK, arch::SYS_GETDENTS, arch::SYS_UNLINK, - arch::SYS_RMDIR, arch::SYS_MKDIR, arch::SYS_RENAME, - arch::SYS_SYMLINK, arch::SYS_LINK, arch::SYS_CHMOD, - arch::SYS_CHOWN, arch::SYS_LCHOWN, - ] { - arch::push_optional_syscall(&mut nrs, nr); - } + nrs.extend(CHROOT_PATH_SYSCALLS); + nrs.extend_optional(CHROOT_LEGACY_PATH_SYSCALLS); } // Explicit deny-paths need path-bearing syscalls intercepted. if !policy.fs_denied.is_empty() { - nrs.extend_from_slice(&[ - libc::SYS_openat as u32, - libc::SYS_execve as u32, - libc::SYS_execveat as u32, - libc::SYS_linkat as u32, - libc::SYS_renameat2 as u32, - libc::SYS_symlinkat as u32, - ]); - for nr in [arch::SYS_OPEN, arch::SYS_LINK, arch::SYS_RENAME, arch::SYS_SYMLINK] { - arch::push_optional_syscall(&mut nrs, nr); - } + nrs.extend(FS_DENIED_PATH_SYSCALLS); + nrs.extend_optional(FS_DENIED_LEGACY_PATH_SYSCALLS); } // Dynamic policy callback — intercept key syscalls for event emission. if policy.policy_fn.is_some() { - nrs.extend_from_slice(&[ - libc::SYS_openat as u32, - libc::SYS_connect as u32, - libc::SYS_sendto as u32, - libc::SYS_bind as u32, - libc::SYS_execve as u32, - libc::SYS_execveat as u32, - ]); + nrs.extend(POLICY_EVENT_SYSCALLS); } // Port remapping if policy.port_remap { - nrs.extend_from_slice(&[ - libc::SYS_bind as u32, - libc::SYS_getsockname as u32, - ]); + nrs.extend(PORT_REMAP_SYSCALLS); } - nrs.sort_unstable(); - nrs.dedup(); - nrs + nrs.finish() } /// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus From f53599b77a17b29b9d878915d5a1265e8efaf41b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 31 May 2026 19:13:46 -0700 Subject: [PATCH 4/4] http: Move HTTP net allow normalization Signed-off-by: Cong Wang --- crates/sandlock-core/src/http.rs | 87 +++++++++++++++++++++++++++++ crates/sandlock-core/src/sandbox.rs | 40 ++----------- 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/crates/sandlock-core/src/http.rs b/crates/sandlock-core/src/http.rs index 98b26fb..39c0e04 100644 --- a/crates/sandlock-core/src/http.rs +++ b/crates/sandlock-core/src/http.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Serialize}; use crate::error::SandboxError; +use crate::network::{NetAllow, Protocol}; /// An HTTP access control rule. #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] @@ -172,6 +173,54 @@ pub fn http_acl_check( false // allow rules exist but none matched } +/// Add the network allowlist entries needed for HTTP ACL interception. +/// +/// HTTP ACLs are enforced by a local proxy, but the sandbox still needs to be +/// allowed to reach the original destination on the intercepted ports. Concrete +/// HTTP rule hosts tighten the IP allowlist to those hosts; wildcard hosts or +/// explicit HTTP ports with no rules allow any IP on the HTTP ports. +pub(crate) fn extend_net_allow_for_http( + net_allow: &mut Vec, + http_allow: &[HttpRule], + http_deny: &[HttpRule], + http_ports: &[u16], +) { + if http_ports.is_empty() { + return; + } + + let mut wildcard_seen = false; + let mut concrete_hosts: Vec = Vec::new(); + for rule in http_allow.iter().chain(http_deny.iter()) { + if rule.host == "*" { + wildcard_seen = true; + } else if !concrete_hosts + .iter() + .any(|host| host.eq_ignore_ascii_case(&rule.host)) + { + concrete_hosts.push(rule.host.clone()); + } + } + + if wildcard_seen || (http_allow.is_empty() && http_deny.is_empty()) { + net_allow.push(NetAllow { + protocol: Protocol::Tcp, + host: None, + ports: http_ports.to_vec(), + all_ports: false, + }); + } + + for host in concrete_hosts { + net_allow.push(NetAllow { + protocol: Protocol::Tcp, + host: Some(host), + ports: http_ports.to_vec(), + all_ports: false, + }); + } +} + #[cfg(test)] mod tests { use super::*; @@ -411,4 +460,42 @@ mod tests { let rule = HttpRule::parse("GET example.com/v1//models").unwrap(); assert_eq!(rule.path, "/v1/models"); } + + #[test] + fn extend_net_allow_for_http_adds_concrete_hosts() { + let allow = vec![ + HttpRule::parse("GET api.example.com/v1/*").unwrap(), + HttpRule::parse("POST API.example.com/v2/*").unwrap(), + ]; + let deny = vec![HttpRule::parse("* admin.example.com/*").unwrap()]; + let mut net_allow = Vec::new(); + + extend_net_allow_for_http(&mut net_allow, &allow, &deny, &[80, 443]); + + assert_eq!(net_allow.len(), 2); + assert_eq!(net_allow[0].protocol, Protocol::Tcp); + assert_eq!(net_allow[0].host.as_deref(), Some("api.example.com")); + assert_eq!(net_allow[0].ports, vec![80, 443]); + assert_eq!(net_allow[1].protocol, Protocol::Tcp); + assert_eq!(net_allow[1].host.as_deref(), Some("admin.example.com")); + assert_eq!(net_allow[1].ports, vec![80, 443]); + } + + #[test] + fn extend_net_allow_for_http_adds_any_ip_for_wildcard_or_bare_port() { + let mut net_allow = Vec::new(); + extend_net_allow_for_http(&mut net_allow, &[], &[], &[8080]); + assert_eq!(net_allow.len(), 1); + assert_eq!(net_allow[0].protocol, Protocol::Tcp); + assert_eq!(net_allow[0].host, None); + assert_eq!(net_allow[0].ports, vec![8080]); + + let allow = vec![HttpRule::parse("* */public/*").unwrap()]; + let mut net_allow = Vec::new(); + extend_net_allow_for_http(&mut net_allow, &allow, &[], &[80]); + assert_eq!(net_allow.len(), 1); + assert_eq!(net_allow[0].protocol, Protocol::Tcp); + assert_eq!(net_allow[0].host, None); + assert_eq!(net_allow[0].ports, vec![80]); + } } diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 558d390..3ef66ab 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -2216,40 +2216,12 @@ impl SandboxBuilder { .map(|s| NetAllow::parse(&s)) .collect::>()?; - // Auto-merge HTTP rules into the network allowlist so the proxy's - // intercept ports remain reachable. A rule with a concrete host - // tightens the IP allowlist (only that host on http_ports); - // wildcard hosts add a `:port` (any IP) rule. This mirrors the - // intent of the old `http_port → net_connect` merge but at the - // endpoint level so HTTP and net_allow stay aligned. - if !http_ports.is_empty() { - let mut wildcard_seen = false; - let mut concrete_hosts: Vec = Vec::new(); - for rule in http_allow.iter().chain(http_deny.iter()) { - if rule.host == "*" { - wildcard_seen = true; - } else if !concrete_hosts.iter().any(|h| h.eq_ignore_ascii_case(&rule.host)) { - concrete_hosts.push(rule.host.clone()); - } - } - if wildcard_seen || (http_allow.is_empty() && http_deny.is_empty()) { - // Fallback: explicit --http-port without rules, or wildcard rules. - net_allow.push(NetAllow { - protocol: Protocol::Tcp, - host: None, - ports: http_ports.clone(), - all_ports: false, - }); - } - for h in concrete_hosts { - net_allow.push(NetAllow { - protocol: Protocol::Tcp, - host: Some(h), - ports: http_ports.clone(), - all_ports: false, - }); - } - } + crate::http::extend_net_allow_for_http( + &mut net_allow, + &http_allow, + &http_deny, + &http_ports, + ); Ok(Sandbox { fs_writable: self.fs_writable,