From a24fc536a910cdab33b6eff11d509c9300eb5a7d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 27 May 2026 13:50:28 +0800 Subject: [PATCH] Speedup vDSO CNTVCT and amortized urandom vDSO clock_gettime drops from 1256 ns SVC trap to 2.5 ns via CNTVCT-based fast path (493x speedup, 20x under the sub-50 ns design target). The trampoline is a 28-instruction A64 sequence that reads CNTVCT_EL0, LDAR-acquires the vvar initialized flag, and interpolates wall clock from the anchor as delta * 125 / 3 (Apple Silicon CNTFRQ = 24 MHz), falling back to SVC on first call or CNTVCT regression. The first SVC seeds the vvar via a three-state CAS (0 -> 2 -> 1) so concurrent first calls cannot tear the anchor fields. The seed is gated on ELR_EL1 matching the trampoline's svc_fallback PC so an unrelated raw clock_gettime syscall cannot poison the anchor from an arbitrary X9. /dev/urandom 1-byte reads drop from 5688 ns uncached to 2054 ns (2.77x) via a new per-fd entropy cache: an arc4random_buf-refilled 4 KiB buffer per FD_URANDOM slot. The cache is zeroed on close via a type-to-cleanup registry that also closes pre-existing dup and fork-state race windows for every synthetic fd type. eventfd dup shares state across aliases per the Linux contract (refcounted slot plus eventfd_owner[FD_TABLE_SIZE] table). The dup path holds fd_lock and sfd_lock together for the bind commit so a racing close cannot leak the refcount; the source identity is pinned via a snapshotted host fd so a racing close-and-rebind of the source cannot bind to the wrong slot. tests/test-eventfd-dup pins the shared-state contract. fork_ipc_send_fd_table filters eventfd, signalfd, timerfd, inotify, netlink, pidfd, and epoll out of the SCM_RIGHTS payload. macOS rejects kqueue fds across SCM_RIGHTS and per-class side-table state is not transferable, so a clean drop is the only honest contract. tests/test-fork-synthetic-fd pins it. 
Startup decomposition: ELFUSE_STARTUP_TRACE=1 emits per-step wall time for VM bring-up (17 steps on test-hello, dominated by hv_vcpu_create and guest_init at roughly 0.9 ms each). Zero overhead when unset. --- src/core/bootstrap.c | 39 +++ src/core/guest.c | 19 ++ src/core/rosetta.c | 2 +- src/core/startup-trace.h | 66 +++++ src/core/vdso.c | 446 ++++++++++++++++++++++++++++++--- src/core/vdso.h | 30 ++- src/runtime/fork-state.c | 48 +++- src/runtime/forkipc.c | 2 +- src/syscall/abi.h | 1 + src/syscall/fd.c | 176 ++++++++++++- src/syscall/fd.h | 15 ++ src/syscall/fdtable.c | 72 +++++- src/syscall/fs.c | 107 +++++--- src/syscall/fuse.c | 16 +- src/syscall/inotify.c | 1 + src/syscall/internal.h | 62 ++++- src/syscall/io.c | 247 ++++++++++++++---- src/syscall/io.h | 2 + src/syscall/net-msg.c | 4 +- src/syscall/net.c | 4 +- src/syscall/netlink.c | 1 + src/syscall/proc-pidfd.c | 7 + src/syscall/proc-pidfd.h | 1 + src/syscall/signal.c | 2 +- src/syscall/syscall.c | 3 + src/syscall/time.c | 22 ++ tests/manifest.txt | 5 +- tests/test-cow-fork.c | 10 +- tests/test-eventfd-dup.c | 65 +++++ tests/test-fork-synthetic-fd.c | 218 ++++++++++++++++ tests/test-large-io-boundary.c | 43 ++++ tests/test-matrix.sh | 2 +- tests/test-syscall-smoke.c | 243 ++++++++++++++++++ tests/test-vdso.c | 242 ++++++++++++++++++ 34 files changed, 2056 insertions(+), 167 deletions(-) create mode 100644 src/core/startup-trace.h create mode 100644 tests/test-eventfd-dup.c create mode 100644 tests/test-fork-synthetic-fd.c create mode 100644 tests/test-vdso.c diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index c6522df..595c0d3 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -21,6 +21,7 @@ #include "core/bootstrap.h" #include "core/rosetta.h" #include "core/stack.h" +#include "core/startup-trace.h" #include "core/vdso.h" #include "runtime/thread.h" @@ -334,14 +335,17 @@ int guest_bootstrap_prepare(guest_t *g, mem_region_t regions[MAX_BOOT_REGIONS]; int nregions = 0; uint64_t 
native_vdso; + uint64_t t0; memset(boot, 0, sizeof(*boot)); *guest_initialized = false; + t0 = startup_trace_now_ns(); if (elf_load(elf_host_path, &boot->elf_info) < 0) { log_error("failed to load ELF: %s", elf_host_path); return -1; } + startup_trace_step("elf_load", t0); bool want_rosetta = false; if (boot->elf_info.e_machine == EM_X86_64) { @@ -374,10 +378,12 @@ int guest_bootstrap_prepare(guest_t *g, * the request is non-fatal in either direction. */ uint32_t req_ipa = want_rosetta ? 48 : 0; + t0 = startup_trace_now_ns(); if (guest_init(g, 0, req_ipa) < 0) { log_error("failed to initialize guest"); return -1; } + startup_trace_step("guest_init", t0); *guest_initialized = true; g->is_rosetta = want_rosetta; proc_set_rosetta_active(want_rosetta); @@ -405,11 +411,13 @@ int guest_bootstrap_prepare(guest_t *g, } else { boot->elf_load_base = (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0; + t0 = startup_trace_now_ns(); if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base, g->guest_size, boot->elf_load_base) < 0) { log_error("failed to map ELF segments"); return -1; } + startup_trace_step("elf_map_segments", t0); /* Track the lowest loaded ELF address so the legacy fork IPC path * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. 
@@ -427,8 +435,10 @@ int guest_bootstrap_prepare(guest_t *g, g->stack_top = STACK_TOP_DEFAULT; g->stack_base = g->stack_top - STACK_SIZE; + t0 = startup_trace_now_ns(); if (!load_interpreter(g, sysroot, boot)) return -1; + startup_trace_step("load_interpreter", t0); } if (shim_bin_len > BLOCK_2MIB) { @@ -436,6 +446,7 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } + t0 = startup_trace_now_ns(); memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len); log_debug("shim loaded at offset 0x%llx (%zu bytes)", (unsigned long long) g->shim_base, shim_bin_len); @@ -448,12 +459,15 @@ int guest_bootstrap_prepare(guest_t *g, } sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, shim_bin_len); + startup_trace_step("shim_load_icache", t0); + t0 = startup_trace_now_ns(); if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, MAX_BOOT_REGIONS); return -1; } + startup_trace_step("build_boot_regions", t0); /* Rosetta path: append the rosetta image as a non-identity region so the * page-table builder maps VA 0x800000000000 -> primary buffer GPA. @@ -461,24 +475,29 @@ int guest_bootstrap_prepare(guest_t *g, * from the same pool that guest_build_page_tables is about to consume). 
*/ if (want_rosetta) { + t0 = startup_trace_now_ns(); if (rosetta_prepare(g, elf_host_path, regions, &nregions, MAX_BOOT_REGIONS, verbose, &rr) < 0) { log_error("rosetta_prepare failed for %s", elf_guest_path); return -1; } + startup_trace_step("rosetta_prepare", t0); } + t0 = startup_trace_now_ns(); boot->ttbr0 = guest_build_page_tables(g, regions, nregions); if (!boot->ttbr0) { log_error("failed to build page tables"); return -1; } + startup_trace_step("guest_build_page_tables", t0); /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the * wrong place to stage a bring-up flush -- bootstrap may run on a thread * whose slot is later consumed by an unrelated syscall. */ + t0 = startup_trace_now_ns(); if (want_rosetta) { /* /proc/self/maps for a rosetta guest reports the rosetta translator * as a single anonymous region covering [VA, VA+size). The original @@ -505,12 +524,14 @@ int guest_bootstrap_prepare(guest_t *g, } register_runtime_regions(g, shim_bin_len); + startup_trace_step("register_regions", t0); log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0, (unsigned long long) g->ipa_base); if (verbose) log_initial_page_tables(g, boot->ttbr0); + t0 = startup_trace_now_ns(); syscall_init(); proc_init(); @@ -526,6 +547,7 @@ int guest_bootstrap_prepare(guest_t *g, proc_set_elf_path(elf_guest_path); if (sysroot) proc_set_sysroot(sysroot); + startup_trace_step("runtime_init", t0); /* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the * binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes @@ -536,18 +558,22 @@ int guest_bootstrap_prepare(guest_t *g, int rosetta_argc = 0; const char **rosetta_argv = NULL; if (want_rosetta) { + t0 = startup_trace_now_ns(); if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp, elf_guest_path, guest_argc, guest_argv, &rr, verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) { 
log_error("rosetta_finalize failed"); return -1; } + startup_trace_step("rosetta_finalize", t0); } else { proc_set_cmdline(guest_argc, guest_argv); } proc_set_environ((const char **) environ); + t0 = startup_trace_now_ns(); native_vdso = vdso_build(g); + startup_trace_step("vdso_build", t0); linux_stack_auxv_t auxv; const elf_info_t *stack_elf = want_rosetta ? &rr.rosetta_info : &boot->elf_info; @@ -555,6 +581,7 @@ int guest_bootstrap_prepare(guest_t *g, uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base; int stack_argc = want_rosetta ? rosetta_argc : guest_argc; const char **stack_argv = want_rosetta ? rosetta_argv : guest_argv; + t0 = startup_trace_now_ns(); boot->stack_pointer = build_linux_stack( g, g->stack_top, stack_argc, stack_argv, (const char **) environ, stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1, @@ -564,6 +591,7 @@ int guest_bootstrap_prepare(guest_t *g, free(rosetta_argv); return -1; } + startup_trace_step("build_linux_stack", t0); /* rosetta_argv was copied into the guest stack; the host allocation is * no longer needed. The strings themselves are constants (ROSETTA_PATH) * or owned by the caller (binary_path, guest_argv entries) so freeing @@ -599,6 +627,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, { uint64_t sctlr; uint64_t sctlr_with_mmu; + uint64_t t0; /* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at * KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps * working. 
Aarch64 guests stay on the EPD1=1 variant which keeps the @@ -613,7 +642,9 @@ int guest_bootstrap_create_vcpu(guest_t *g, hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; + t0 = startup_trace_now_ns(); HV_CHECK(hv_vcpu_create(&vcpu, &vexit, NULL)); + startup_trace_step("hv_vcpu_create", t0); g->vcpu = vcpu; g->exit = vexit; *out_vcpu = vcpu; @@ -621,6 +652,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, thread_register_main(vcpu, vexit, proc_get_pid(), el1_sp); + t0 = startup_trace_now_ns(); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, shim_ipa + 0x800)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value)); @@ -632,6 +664,12 @@ int guest_bootstrap_create_vcpu(guest_t *g, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, el1_sp)); + /* CNTKCTL_EL1.EL0VCTEN | EL0PCTEN: allow EL0 to read CNTVCT_EL0 / + * CNTPCT_EL0. Required by the vDSO clock_gettime fast path (and is the + * default on native Linux), without which the guest gets 0 back from MRS. 
+ */ + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CNTKCTL_EL1, 0x3ULL)); + HV_CHECK(hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, &sctlr)); log_debug("SCTLR_EL1 default=0x%llx", (unsigned long long) sctlr); @@ -645,6 +683,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE | SCTLR_UCT | SCTLR_UCI; HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_X0, sctlr_with_mmu)); + startup_trace_step("hv_vcpu_configure", t0); log_debug( "vCPU configured: PC=0x%llx SCTLR=0x%llx VBAR=0x%llx TTBR0=0x%llx " diff --git a/src/core/guest.c b/src/core/guest.c index 6393b00..6098828 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -38,6 +38,7 @@ #include #include "core/guest.h" +#include "core/startup-trace.h" #include "debug/log.h" #include "utils.h" #include "runtime/thread.h" /* thread_destroy_all_vcpus */ @@ -202,6 +203,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa) int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) { + uint64_t t0; + memset(g, 0, sizeof(*g)); g->shm_fd = -1; g->ipa_base = GUEST_IPA_BASE; @@ -257,6 +260,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * seconds max wait) to handle this gracefully. */ hv_return_t ret = HV_ERROR; + t0 = startup_trace_now_ns(); for (int attempt = 0; attempt < 30; attempt++) { hv_vm_config_t config = hv_vm_config_create(); hv_vm_config_set_ipa_size(config, vm_ipa); @@ -266,6 +270,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) break; usleep(500000); /* 500ms between attempts */ } + startup_trace_step("hv_vm_create", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret, vm_ipa); @@ -307,8 +312,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * physical memory. Do NOT memset because that would touch every * page and defeat demand paging. 
*/ + t0 = startup_trace_now_ns(); g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + startup_trace_step("primary_mmap", t0); if (g->host_base == MAP_FAILED) { perror("guest: mmap"); g->host_base = NULL; @@ -320,6 +327,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * path instead of SCM_RIGHTS fd passing. */ char tmppath[] = "/tmp/elfuse-XXXXXX"; + t0 = startup_trace_now_ns(); int sfd = mkstemp(tmppath); if (sfd >= 0) { unlink(tmppath); /* Unlink immediately; fd keeps file alive */ @@ -335,9 +343,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) close(sfd); } } + startup_trace_step("cow_shm_upgrade", t0); + t0 = startup_trace_now_ns(); ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); + startup_trace_step("hv_vm_map", t0); if (ret == HV_SUCCESS) { mapped_size = try_size; mapped = true; @@ -380,6 +391,8 @@ int guest_init_from_shm(guest_t *g, uint64_t size, uint32_t ipa_bits) { + uint64_t t0; + memset(g, 0, sizeof(*g)); g->shm_fd = -1; /* Child does not own the shm */ g->ipa_base = GUEST_IPA_BASE; @@ -403,8 +416,10 @@ int guest_init_from_shm(guest_t *g, * the parent's frozen snapshot; writes are private to this process. * macOS CoW is page-granular: only modified pages are duplicated. 
*/ + t0 = startup_trace_now_ns(); g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0); + startup_trace_step("shm_mmap", t0); if (g->host_base == MAP_FAILED) { perror("guest: mmap shm"); g->host_base = NULL; @@ -417,6 +432,7 @@ int guest_init_from_shm(guest_t *g, /* Create HVF VM with the same IPA width as the parent */ hv_return_t ret = HV_ERROR; + t0 = startup_trace_now_ns(); for (int attempt = 0; attempt < 30; attempt++) { hv_vm_config_t config = hv_vm_config_create(); hv_vm_config_set_ipa_size(config, ipa_bits); @@ -426,6 +442,7 @@ int guest_init_from_shm(guest_t *g, break; usleep(500000); } + startup_trace_step("hv_vm_create_shm", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_create (shm) failed: %d", (int) ret); munmap(g->host_base, size); @@ -433,8 +450,10 @@ int guest_init_from_shm(guest_t *g, return -1; } + t0 = startup_trace_now_ns(); ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); + startup_trace_step("hv_vm_map_shm", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_map (shm) failed: %d", (int) ret); hv_vm_destroy(); diff --git a/src/core/rosetta.c b/src/core/rosetta.c index 32588b4..caeabae 100644 --- a/src/core/rosetta.c +++ b/src/core/rosetta.c @@ -469,7 +469,7 @@ int rosetta_finalize(guest_t *g, * goto fail must be introduced below, or the fail handler would * double-close it. */ - int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd); + int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd, NULL); if (bin_guest_fd < 0) { log_error("rosetta_finalize: fd_alloc_at(3) failed"); goto fail; diff --git a/src/core/startup-trace.h b/src/core/startup-trace.h new file mode 100644 index 0000000..b2b75d8 --- /dev/null +++ b/src/core/startup-trace.h @@ -0,0 +1,66 @@ +/* Startup tracing helpers + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Lightweight per-step wall-time tracer for VM bring-up. 
Gated by the + * ELFUSE_STARTUP_TRACE environment variable so a release-build run pays + * exactly one getenv + one branch per step when disabled. The helpers are + * static inline so each translation unit can use them without pulling in a + * separate object; the getenv check resolves once per translation unit but + * the resolution itself is idempotent. + */ + +#ifndef ELFUSE_STARTUP_TRACE_H +#define ELFUSE_STARTUP_TRACE_H + +#include +#include +#include +#include +#include +#include +#include + +/* File-scope cache (one copy per translation unit including this header). + * pthread_once serializes concurrent first callers and supplies the + * memory ordering that makes the cached value safely visible to all + * subsequent readers without explicit atomics. + */ +static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT; +static bool startup_trace_value; + +static inline void startup_trace_resolve(void) +{ + const char *v = getenv("ELFUSE_STARTUP_TRACE"); + startup_trace_value = v && v[0] && strcmp(v, "0") != 0; +} + +static inline bool startup_trace_enabled(void) +{ + pthread_once(&startup_trace_once, startup_trace_resolve); + return startup_trace_value; +} + +static inline uint64_t startup_trace_now_ns(void) +{ + if (!startup_trace_enabled()) + return 0; + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return 0; + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} + +static inline void startup_trace_step(const char *label, uint64_t start_ns) +{ + if (start_ns == 0) + return; + uint64_t end_ns = startup_trace_now_ns(); + if (end_ns < start_ns) + return; + fprintf(stderr, "startup %-28s %8.3f ms\n", label, + (double) (end_ns - start_ns) / 1000000.0); +} + +#endif /* ELFUSE_STARTUP_TRACE_H */ diff --git a/src/core/vdso.c b/src/core/vdso.c index 444be88..f50c5f8 100644 --- a/src/core/vdso.c +++ b/src/core/vdso.c @@ -4,7 +4,7 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. 
* SPDX-License-Identifier: Apache-2.0 * - * Builds a minimal vDSO ELF image in guest memory exposing + * Builds a minimal vDSO ELF image in guest memory exposing versioned * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}. Each entry * point is an SVC trampoline that traps back to the host for the actual work. * @@ -13,7 +13,9 @@ * CNTVCT_EL0 from the macOS frame of reference while the guest reads it through * HVF's CNTVOFF_EL2 virtualization, so the seqlock interpolation produced bogus * times (year 26382). The fast path is gone; SVC is correct and the trap cost - * is negligible compared to the work clock_gettime callers tend to do anyway. + * is still one syscall round trip, but the versioned ELF metadata lets modern + * libcs find the trampoline instead of falling back to their generic syscall + * path. */ #include @@ -44,11 +46,28 @@ typedef struct { uint64_t st_value, st_size; } elf64_sym_t; +typedef struct { + uint16_t vd_version; + uint16_t vd_flags; + uint16_t vd_ndx; + uint16_t vd_cnt; + uint32_t vd_hash; + uint32_t vd_aux; + uint32_t vd_next; +} elf64_verdef_t; + +typedef struct { + uint32_t vda_name; + uint32_t vda_next; +} elf64_verdaux_t; + /* ELF constants */ #define SHT_STRTAB 3 #define SHT_HASH 5 #define SHT_DYNAMIC 6 #define SHT_DYNSYM 11 +#define SHT_GNU_VERDEF 0x6ffffffd +#define SHT_GNU_VERSYM 0x6fffffff #define SHF_ALLOC (1ULL << 1) #define SHF_EXECINSTR (1ULL << 2) #define DT_NULL 0 @@ -57,8 +76,13 @@ typedef struct { #define DT_SYMTAB 6 #define DT_STRSZ 10 #define DT_SYMENT 11 +#define DT_VERSYM 0x6ffffff0 +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd #define STB_GLOBAL 1 #define STT_FUNC 2 +#define VER_DEF_CURRENT 1 +#define VDSO_LINUX_VERSION_INDEX 2 #define ELF_ST_INFO(bind, type) (((bind) << 4) | ((type) & 0xf)) /* Layout. @@ -75,50 +99,115 @@ typedef struct { #define VDSO_OFF_PHDR 0x040 #define VDSO_OFF_PHDR1 0x078 -/* .text trampolines (each 12 bytes: mov x8, #N; svc #0; ret). 
*/ -#define TEXT_OFF_SIGRET 0x0B0 -#define TEXT_OFF_GETRES 0x0BC -#define TEXT_OFF_GETTIME 0x0C8 -#define TEXT_OFF_GETTOD 0x0D4 -#define TEXT_END 0x0E0 +/* vvar at fixed offset; host writes the wall-clock anchor on first + * clock_gettime SVC, after the guest trampoline has stored its own + * CNTVCT_EL0 read into X9. Layout: + * +0 uint32 initialized (host sets 1 after anchor_sec/anchor_nsec) + * +4 uint32 pad + * +8 uint64 anchor_cntvct (guest frame, written by host from X9) + * +16 uint64 anchor_sec + * +24 uint64 anchor_nsec + */ +#define VDSO_OFF_VVAR 0x0B0 +#define VVAR_OFF_INITIALIZED 0x00 +#define VVAR_OFF_ANCHOR_CNTVCT 0x08 +#define VVAR_OFF_ANCHOR_SEC 0x10 +#define VVAR_OFF_ANCHOR_NSEC 0x18 +#define VVAR_SIZE 0x20 + +/* .text trampolines. rt_sigreturn / clock_getres / gettimeofday are 12-byte + * SVC trampolines. clock_gettime is the CNTVCT-based fast-path trampoline + * (112 bytes = 28 instructions including the svc_fallback tail). The + * trampoline uses LDAR on the vvar initialized flag, treats both states + * 0 (unseeded) and 2 (host-side reservation in vdso_seed_anchor) as + * fall-back, and guards the CNTVCT-anchor subtraction against unsigned + * underflow via SUBS + B.LO. + */ +#define TEXT_OFF_SIGRET 0x0D0 +#define TEXT_OFF_GETRES 0x0DC +#define TEXT_OFF_GETTIME 0x0E8 +#define TEXT_GETTIME_SIZE 0x70 +#define TEXT_OFF_GETTOD (TEXT_OFF_GETTIME + TEXT_GETTIME_SIZE) +#define TEXT_END (TEXT_OFF_GETTOD + 12) +/* Address of the SVC inside __kernel_clock_gettime's svc_fallback (offset + * 0x68 within the trampoline). The host's sys_clock_gettime uses this + * value to gate vvar seeding: only a trap whose ELR_EL1 equals SVC_PC + 4 + * came from the trampoline and may carry a trustworthy CNTVCT in X9. + */ +#define VDSO_CLOCK_GETTIME_SVC_PC (TEXT_OFF_GETTIME + 0x68) + +/* dynstr, dynsym, hash, GNU version metadata, dynamic, shdr follow. + * TEXT_END is 0x164 after the fast-path expansion; pad to 8-byte align. 
+ */ +#define VDSO_OFF_DYNSTR 0x168 -/* dynstr, dynsym, hash, dynamic, shdr follow */ -#define VDSO_OFF_DYNSTR 0x0E0 -#define DYNSTR_SIZE 90 +/* Padded to 8-byte align: 0x168 + 103 = 0x1CF, pad to 0x1D0 */ +#define VDSO_OFF_DYNSYM 0x1D0 -/* Padded to 4-byte align: 0x0E0 + 90 = 0x13A, pad to 0x13C */ -#define VDSO_OFF_DYNSYM 0x13C +/* 5 * 24 = 120, 0x1D0 + 120 = 0x248 */ +#define VDSO_OFF_HASH 0x248 -/* 5 * 24 = 120, 0x13C + 120 = 0x1B4 */ -#define VDSO_OFF_HASH 0x1B4 +/* 2+1+5 = 8 words * 4 = 32, 0x248 + 32 = 0x268 */ +#define VDSO_OFF_VERSYM 0x268 -/* 2+1+5 = 8 words * 4 = 32, 0x1B4 + 32 = 0x1D4, pad to 0x1D8 */ -#define VDSO_OFF_DYNAMIC 0x1D8 +/* 5 * 2 = 10, 0x268 + 10 = 0x272, pad to 0x278 */ +#define VDSO_OFF_VERDEF 0x278 -/* 6 * 16 = 96, 0x1D8 + 96 = 0x238 */ -#define VDSO_OFF_SHDR 0x238 +/* Verdef + verdaux = 28, 0x278 + 28 = 0x294, pad to 0x298 */ +#define VDSO_OFF_DYNAMIC 0x298 -/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KiB) */ +/* 9 * 16 = 144, 0x298 + 144 = 0x328 */ +#define VDSO_OFF_SHDR 0x328 + +/* 8 * 64 = 512, 0x328 + 512 = 0x528 (fits in 4 KiB) */ #define VDSO_NUM_SYMS 4 #define HASH_NCHAIN (VDSO_NUM_SYMS + 1) #define HASH_NBUCKET 1 #define HASH_SIZE ((2 + HASH_NBUCKET + HASH_NCHAIN) * sizeof(uint32_t)) +#define VERSYM_SIZE ((VDSO_NUM_SYMS + 1) * sizeof(uint16_t)) +#define VERDEF_SIZE (sizeof(elf64_verdef_t) + sizeof(elf64_verdaux_t)) +#define VDSO_NUM_DYN 9 /* .dynstr data */ static const char dynstr_data[] = "\0__kernel_rt_sigreturn" "\0__kernel_clock_getres" "\0__kernel_clock_gettime" - "\0__kernel_gettimeofday"; - -/* Symbol name offsets */ -static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {1, 23, 45, 68}; + "\0__kernel_gettimeofday" + "\0LINUX_2.6.39"; +#define DYNSTR_SIZE sizeof(dynstr_data) + +/* Symbol name offsets, derived from preceding string-literal lengths so a + * future edit to dynstr_data shifts them in lockstep instead of silently + * breaking the version lookup (sizeof("\0X") - 1 == bytes contributed when + * X is 
concatenated into dynstr_data; only the very last literal's trailing + * NUL survives concatenation). + */ +#define DYNSTR_BYTES_RT_SIGRETURN (sizeof("\0__kernel_rt_sigreturn") - 1) +#define DYNSTR_BYTES_CLOCK_GETRES (sizeof("\0__kernel_clock_getres") - 1) +#define DYNSTR_BYTES_CLOCK_GETTIME (sizeof("\0__kernel_clock_gettime") - 1) +#define DYNSTR_BYTES_GETTIMEOFDAY (sizeof("\0__kernel_gettimeofday") - 1) + +static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = { + 1, + DYNSTR_BYTES_RT_SIGRETURN + 1, + DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + 1, + DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + + DYNSTR_BYTES_CLOCK_GETTIME + 1, +}; +/* Skip the leading \0 of "\0LINUX_2.6.39" to land on 'L'. */ +#define VDSO_LINUX_VERSION_NAME_OFF \ + (DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + \ + DYNSTR_BYTES_CLOCK_GETTIME + DYNSTR_BYTES_GETTIMEOFDAY + 1) + +_Static_assert(sizeof(dynstr_data) <= 104, + "dynstr_data outgrew the DYNSYM padding window"); /* Symbol text offsets and sizes */ static const uint32_t sym_text_off[VDSO_NUM_SYMS] = { TEXT_OFF_SIGRET, TEXT_OFF_GETRES, TEXT_OFF_GETTIME, TEXT_OFF_GETTOD}; -static const uint32_t sym_text_size[VDSO_NUM_SYMS] = { - 12, 12, TEXT_OFF_GETTOD - TEXT_OFF_GETTIME, 12}; +static const uint32_t sym_text_size[VDSO_NUM_SYMS] = {12, 12, TEXT_GETTIME_SIZE, + 12}; /* Emit a 12-byte SVC trampoline: mov x8, #syscall_nr; svc #0; ret. */ static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr) @@ -129,6 +218,209 @@ static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr) code[2] = 0xD65F03C0U; /* ret */ } +/* CNTVCT-based fast-path trampoline for __kernel_clock_gettime. The guest + * always reads CNTVCT_EL0 into X9 first, then either falls through to a + * full SVC (CLOCK_REALTIME, unsupported clockids, vvar uninitialized) or + * interpolates wall_clock from the vvar anchor. 
The host's + * sys_clock_gettime handler reads X9 on the first SVC and seeds the vvar + * (anchor_cntvct = X9, anchor_sec/nsec = wall_clock), so subsequent calls + * skip the trap. CNTKCTL_EL1.EL0VCTEN is set in bootstrap to allow the + * MRS at EL0; without that the trampoline gets 0 back and the math + * collapses. + * + * Layout (vvar_off is byte offset from the trampoline's first instruction + * to VDSO_OFF_VVAR; resolved by emit_clock_gettime_trampoline below): + * 00: mrs x9, cntvct_el0 ; always read first + * 04: cmp w0, #1 ; CLOCK_MONOTONIC? + * 08: b.ne svc_fallback + * 0C: adr x2, vvar + * 10: ldar w3, [x2] ; acquire-load INITIALIZED + * 14: cmp w3, #1 ; 0 = unseeded, 2 = host reservation + * 18: b.ne svc_fallback ; not seeded yet + * 1C: ldr x3, [x2, #ANCHOR_CNTVCT] + * 20: ldr x4, [x2, #ANCHOR_SEC] + * 24: ldr x5, [x2, #ANCHOR_NSEC] + * 28: subs x6, x9, x3 ; delta cycles (CNTFRQ = 24 MHz) + * 2C: b.lo svc_fallback ; CNTVCT behind the anchor + * 30: mov x7, #125 + * 34: mul x6, x6, x7 ; delta * 125 + * 38: mov x7, #3 + * 3C: udiv x6, x6, x7 ; delta_ns + * 40: add x5, x5, x6 ; raw nsec + * 44: mov x7, #0xCA00 + * 48: movk x7, #0x3B9A, lsl #16 ; x7 = 1e9 + * 4C: udiv x8, x5, x7 ; sec carry + * 50: msub x5, x8, x7, x5 ; nsec %= 1e9 + * 54: add x4, x4, x8 ; final sec + * 58: stp x4, x5, [x1] ; store {sec, nsec} + * 5C: mov x0, #0 + * 60: ret + * 64: (svc_fallback: mov x8 #113; svc #0; ret) + * + * svc_fallback shares __kernel_clock_gettime's slot; both paths end in RET. + */ + +/* AArch64 instruction encoders (only the ones used here). 
*/ +static uint32_t enc_movz_x(unsigned rd, uint16_t imm) +{ + return 0xD2800000U | ((uint32_t) imm << 5) | (rd & 0x1F); +} + +static uint32_t enc_movk_x_lsl16(unsigned rd, uint16_t imm) +{ + return 0xF2A00000U | ((uint32_t) imm << 5) | (rd & 0x1F); +} + +static uint32_t enc_adr(unsigned rd, int32_t pc_rel) +{ + uint32_t immlo = (uint32_t) (pc_rel & 0x3); + uint32_t immhi = (uint32_t) ((pc_rel >> 2) & 0x7FFFF); + return 0x10000000U | (immlo << 29) | (immhi << 5) | (rd & 0x1F); +} + +/* B.cond imm19. cond is the 4-bit AArch64 condition (NE=0x1, LO=0x3, etc.). */ +#define COND_NE 0x1 +#define COND_LO 0x3 +static uint32_t enc_bcond_imm19(unsigned cond, int32_t pc_rel) +{ + uint32_t imm19 = (uint32_t) ((pc_rel >> 2) & 0x7FFFF); + return 0x54000000U | (imm19 << 5) | (cond & 0xF); +} + +static uint32_t enc_ldr_x_imm12(unsigned rt, unsigned rn, uint32_t off_bytes) +{ + return 0xF9400000U | ((off_bytes / 8) << 10) | ((rn & 0x1F) << 5) | + (rt & 0x1F); +} + +static uint32_t enc_add_x(unsigned rd, unsigned rn, unsigned rm) +{ + return 0x8B000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F); +} + +static uint32_t enc_mul_x(unsigned rd, unsigned rn, unsigned rm) +{ + return 0x9B007C00U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F); +} + +static uint32_t enc_udiv_x(unsigned rd, unsigned rn, unsigned rm) +{ + return 0x9AC00800U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F); +} + +static uint32_t enc_msub_x(unsigned rd, unsigned rn, unsigned rm, unsigned ra) +{ + return 0x9B008000U | ((rm & 0x1F) << 16) | ((ra & 0x1F) << 10) | + ((rn & 0x1F) << 5) | (rd & 0x1F); +} + +static uint32_t enc_stp_x_imm7(unsigned rt1, + unsigned rt2, + unsigned rn, + int32_t off_bytes) +{ + int32_t imm7 = (off_bytes / 8) & 0x7F; + return 0xA9000000U | ((uint32_t) imm7 << 15) | ((rt2 & 0x1F) << 10) | + ((rn & 0x1F) << 5) | (rt1 & 0x1F); +} + +static uint32_t enc_cmp_w_imm12(unsigned rn, uint32_t imm12) +{ + /* SUBS WZR, Wn, #imm12 */ + return 0x7100001FU | ((imm12 & 
0xFFF) << 10) | ((rn & 0x1F) << 5); +} + +/* LDAR Wt, [Xn] -- acquire load of a 32-bit word. Pairs with the host's + * __atomic_store_n(initialized, ..., __ATOMIC_RELEASE) so that observing + * initialized != 0 also makes the prior anchor stores visible. + */ +static uint32_t enc_ldar_w(unsigned rt, unsigned rn) +{ + return 0x88DFFC00U | ((rn & 0x1F) << 5) | (rt & 0x1F); +} + +/* SUBS Xd, Xn, Xm (set flags). */ +static uint32_t enc_subs_x(unsigned rd, unsigned rn, unsigned rm) +{ + return 0xEB000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F); +} + +/* Emit the CNTVCT fast-path clock_gettime trampoline at page+pc_off; the + * vvar lives at page+vvar_off. The trampoline is exactly TEXT_GETTIME_SIZE + * bytes; the static_assert below catches drift. + */ +static void emit_clock_gettime_trampoline(uint32_t *code, + uint32_t pc_off, + uint32_t vvar_off) +{ + /* svc_fallback starts at offset 0x64 within the trampoline. The + * branch instructions live at offsets 0x08 (b.ne on clockid != 1), + * 0x18 (b.ne on initialized != 1), and 0x2C (b.lo on cntvct underflow). + * Each branch encoder takes a byte-relative offset (target - branch_pc) + * and shifts >> 2 internally for imm19. 
+ */ + int32_t svc_fallback_off = 0x64; + int32_t adr_pc_off = 0x0C; + int32_t vvar_rel = (int32_t) vvar_off - (int32_t) (pc_off + adr_pc_off); + + code[0] = 0xD53BE049U; /* mrs x9, cntvct_el0 */ + code[1] = enc_cmp_w_imm12(0, 1); /* cmp w0, #1 */ + code[2] = + enc_bcond_imm19(COND_NE, svc_fallback_off - 0x08); /* b.ne fallback */ + code[3] = enc_adr(2, vvar_rel); /* adr x2, vvar */ + code[4] = enc_ldar_w(3, 2); /* ldar w3, [x2] */ + code[5] = enc_cmp_w_imm12(3, 1); /* cmp w3, #1 */ + code[6] = + enc_bcond_imm19(COND_NE, svc_fallback_off - 0x18); /* b.ne fallback */ + code[7] = enc_ldr_x_imm12(3, 2, VVAR_OFF_ANCHOR_CNTVCT); + code[8] = enc_ldr_x_imm12(4, 2, VVAR_OFF_ANCHOR_SEC); + code[9] = enc_ldr_x_imm12(5, 2, VVAR_OFF_ANCHOR_NSEC); + code[10] = enc_subs_x(6, 9, 3); /* subs x6, x9, x3 (delta) */ + code[11] = + enc_bcond_imm19(COND_LO, svc_fallback_off - 0x2C); /* b.lo fallback */ + code[12] = enc_movz_x(7, 125); + code[13] = enc_mul_x(6, 6, 7); /* delta * 125 */ + code[14] = enc_movz_x(7, 3); + code[15] = enc_udiv_x(6, 6, 7); /* delta_ns = delta*125/3 */ + code[16] = enc_add_x(5, 5, 6); /* nsec + delta_ns */ + code[17] = enc_movz_x(7, 0xCA00); + code[18] = enc_movk_x_lsl16(7, 0x3B9A); /* x7 = 1e9 */ + code[19] = enc_udiv_x(8, 5, 7); /* sec_carry */ + code[20] = enc_msub_x(5, 8, 7, 5); /* nsec %= 1e9 */ + code[21] = enc_add_x(4, 4, 8); /* sec += carry */ + code[22] = enc_stp_x_imm7(4, 5, 1, 0); /* stp x4, x5, [x1] */ + code[23] = enc_movz_x(0, 0); /* return 0 */ + code[24] = 0xD65F03C0U; /* ret */ + /* svc_fallback at offset 0x64: mov x8, #113; svc #0; ret */ + code[25] = enc_movz_x(8, 113); + code[26] = 0xD4000001U; /* svc #0 */ + code[27] = 0xD65F03C0U; /* ret */ +} + +_Static_assert(TEXT_GETTIME_SIZE == 28 * sizeof(uint32_t), + "clock_gettime trampoline size must match emitter"); + +/* The public sigret offset declared in core/vdso.h must match the + * internal layout above; signal.c sets X30 to VDSO_BASE + VDSO_OFF_SIGRET + * as the return-from-handler target. 
+ */ +_Static_assert(VDSO_OFF_SIGRET == TEXT_OFF_SIGRET, + "VDSO_OFF_SIGRET in core/vdso.h must equal TEXT_OFF_SIGRET"); + +static uint32_t elf_hash(const char *name) +{ + uint32_t h = 0, g; + + while (*name) { + h = (h << 4) + (unsigned char) *name++; + g = h & 0xf0000000U; + if (g) + h ^= g >> 24; + h &= ~g; + } + return h; +} + uint64_t vdso_build(guest_t *g) { uint8_t *page = (uint8_t *) guest_ptr(g, VDSO_BASE); @@ -160,7 +452,7 @@ uint64_t vdso_build(guest_t *g) ehdr->e_phentsize = sizeof(elf64_phdr_t); ehdr->e_phnum = 2; ehdr->e_shentsize = sizeof(elf64_shdr_t); - ehdr->e_shnum = 6; + ehdr->e_shnum = 8; ehdr->e_shstrndx = 2; /* Program header 0: PT_LOAD. */ @@ -181,8 +473,8 @@ uint64_t vdso_build(guest_t *g) phdr1->p_offset = VDSO_OFF_DYNAMIC; phdr1->p_vaddr = VDSO_OFF_DYNAMIC; phdr1->p_paddr = VDSO_OFF_DYNAMIC; - phdr1->p_filesz = 6 * sizeof(elf64_dyn_t); - phdr1->p_memsz = 6 * sizeof(elf64_dyn_t); + phdr1->p_filesz = VDSO_NUM_DYN * sizeof(elf64_dyn_t); + phdr1->p_memsz = VDSO_NUM_DYN * sizeof(elf64_dyn_t); phdr1->p_align = 8; /* Text trampolines. Each entry is the same 12-byte mov/svc/ret pattern @@ -190,9 +482,14 @@ uint64_t vdso_build(guest_t *g) */ emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_SIGRET), 139); emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETRES), 114); - emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), 113); + emit_clock_gettime_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), + TEXT_OFF_GETTIME, VDSO_OFF_VVAR); emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTOD), 169); + /* vvar starts zero (initialized==0). The first __kernel_clock_gettime + * SVC fallback will let the host populate the anchor. + */ + /* Dynamic string table. */ memcpy(page + VDSO_OFF_DYNSTR, dynstr_data, DYNSTR_SIZE); @@ -221,6 +518,27 @@ uint64_t vdso_build(guest_t *g) } hash[2] = first_sym; + /* GNU symbol versioning. glibc's aarch64 vDSO resolver asks for + * LINUX_2.6.39 and ignores unversioned helpers. 
+ */ + uint16_t *versym = (uint16_t *) (page + VDSO_OFF_VERSYM); + versym[0] = 0; + for (int i = 1; i <= VDSO_NUM_SYMS; i++) + versym[i] = VDSO_LINUX_VERSION_INDEX; + + elf64_verdef_t *verdef = (elf64_verdef_t *) (page + VDSO_OFF_VERDEF); + elf64_verdaux_t *verdaux = + (elf64_verdaux_t *) (page + VDSO_OFF_VERDEF + sizeof(*verdef)); + verdef->vd_version = VER_DEF_CURRENT; + verdef->vd_flags = 0; + verdef->vd_ndx = VDSO_LINUX_VERSION_INDEX; + verdef->vd_cnt = 1; + verdef->vd_hash = elf_hash("LINUX_2.6.39"); + verdef->vd_aux = sizeof(*verdef); + verdef->vd_next = 0; + verdaux->vda_name = VDSO_LINUX_VERSION_NAME_OFF; + verdaux->vda_next = 0; + /* Dynamic table. */ elf64_dyn_t *dyn = (elf64_dyn_t *) (page + VDSO_OFF_DYNAMIC); dyn[0] = (elf64_dyn_t) {DT_HASH, VDSO_OFF_HASH}; @@ -228,7 +546,10 @@ uint64_t vdso_build(guest_t *g) dyn[2] = (elf64_dyn_t) {DT_STRTAB, VDSO_OFF_DYNSTR}; dyn[3] = (elf64_dyn_t) {DT_STRSZ, DYNSTR_SIZE}; dyn[4] = (elf64_dyn_t) {DT_SYMENT, sizeof(elf64_sym_t)}; - dyn[5] = (elf64_dyn_t) {DT_NULL, 0}; + dyn[5] = (elf64_dyn_t) {DT_VERSYM, VDSO_OFF_VERSYM}; + dyn[6] = (elf64_dyn_t) {DT_VERDEF, VDSO_OFF_VERDEF}; + dyn[7] = (elf64_dyn_t) {DT_VERDEFNUM, 1}; + dyn[8] = (elf64_dyn_t) {DT_NULL, 0}; /* Section headers. 
*/ elf64_shdr_t *shdr = (elf64_shdr_t *) (page + VDSO_OFF_SHDR); @@ -276,10 +597,71 @@ uint64_t vdso_build(guest_t *g) shdr[5].sh_flags = SHF_ALLOC; shdr[5].sh_addr = VDSO_OFF_DYNAMIC; shdr[5].sh_offset = VDSO_OFF_DYNAMIC; - shdr[5].sh_size = 6 * sizeof(elf64_dyn_t); + shdr[5].sh_size = VDSO_NUM_DYN * sizeof(elf64_dyn_t); shdr[5].sh_link = 2; shdr[5].sh_addralign = 8; shdr[5].sh_entsize = sizeof(elf64_dyn_t); + shdr[6].sh_name = 0; + shdr[6].sh_type = SHT_GNU_VERSYM; + shdr[6].sh_flags = SHF_ALLOC; + shdr[6].sh_addr = VDSO_OFF_VERSYM; + shdr[6].sh_offset = VDSO_OFF_VERSYM; + shdr[6].sh_size = VERSYM_SIZE; + shdr[6].sh_link = 3; + shdr[6].sh_addralign = 2; + shdr[6].sh_entsize = sizeof(uint16_t); + + shdr[7].sh_name = 0; + shdr[7].sh_type = SHT_GNU_VERDEF; + shdr[7].sh_flags = SHF_ALLOC; + shdr[7].sh_addr = VDSO_OFF_VERDEF; + shdr[7].sh_offset = VDSO_OFF_VERDEF; + shdr[7].sh_size = VERDEF_SIZE; + shdr[7].sh_link = 2; + shdr[7].sh_info = 1; + shdr[7].sh_addralign = 4; + return VDSO_BASE; } + +void vdso_seed_anchor(guest_t *g, + uint64_t guest_cntvct, + int64_t anchor_sec, + int64_t anchor_nsec) +{ + uint8_t *page = (uint8_t *) guest_ptr(g, VDSO_BASE); + if (!page) + return; + uint32_t *initialized = (uint32_t *) (page + VDSO_OFF_VVAR); + uint8_t *vvar = page + VDSO_OFF_VVAR; + + /* Three-state CAS reservation: 0 = unseeded, 2 = reserving (one host + * thread owns the anchor stores), 1 = ready. Multiple host threads can + * concurrently take the SVC fallback on the first guest call; without + * the reservation they race on the plain anchor stores. The CAS winner + * writes the fields and releases 1; losers bail. The guest trampoline + * loads initialized with LDAR and only takes the fast path on + * initialized == 1, so state 2 still routes to the SVC fallback. 
+ */ + uint32_t expected = 0; + if (!__atomic_compare_exchange_n(initialized, &expected, 2, + /* weak */ false, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) + return; + + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_CNTVCT) = guest_cntvct; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_SEC) = (uint64_t) anchor_sec; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_NSEC) = (uint64_t) anchor_nsec; + + /* The release-store on initialized pairs with the trampoline's LDAR + * load on the same address; observing 1 also makes the anchor fields + * visible to the guest. + */ + __atomic_store_n(initialized, 1, __ATOMIC_RELEASE); +} + +uint64_t vdso_clock_gettime_svc_pc(void) +{ + return VDSO_BASE + VDSO_CLOCK_GETTIME_SVC_PC; +} diff --git a/src/core/vdso.h b/src/core/vdso.h index e3a41d5..b1ea9c2 100644 --- a/src/core/vdso.h +++ b/src/core/vdso.h @@ -17,12 +17,36 @@ /* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */ #define VDSO_BASE 0x0000F000ULL #define VDSO_SIZE 0x00001000ULL /* 4KiB */ -#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ +/* Offset of __kernel_rt_sigreturn (the signal trampoline glibc/musl jumps + * to via X30/LR after the handler returns). Must match TEXT_OFF_SIGRET in + * src/core/vdso.c; kept here so signal.c can target it without including + * the vDSO internals. + */ +#define VDSO_OFF_SIGRET 0x0D0 /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory. * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM - * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to - * a small trampoline (mov x8, #139; svc #0). + * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to a + * small trampoline (mov x8, #139; svc #0). * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure. */ uint64_t vdso_build(guest_t *g); + +/* If the vvar anchor has not been seeded yet, install the supplied cntvct as + * the guest-frame anchor paired with the given wall_clock. 
Idempotent: + * subsequent calls with initialized != 0 are no-ops. Used by sys_clock_gettime + * to upgrade the first __kernel_clock_gettime SVC fallback into a permanent + * vvar fast path. + */ +void vdso_seed_anchor(guest_t *g, + uint64_t guest_cntvct, + int64_t anchor_sec, + int64_t anchor_nsec); + +/* GVA at which the trampoline's svc_fallback issues its SVC. Used by + * sys_clock_gettime to verify a clock_gettime trap actually came from the vDSO + * fallback path (and thus carries a guest-frame CNTVCT in X9) versus an + * unrelated raw syscall(SYS_clock_gettime, ...). The trap returns to SVC_PC + * + 4, so callers compare ELR_EL1 against that. + */ +uint64_t vdso_clock_gettime_svc_pc(void); diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index f9746cd..edf1758 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -21,6 +21,7 @@ #include "debug/log.h" #include "syscall/abi.h" #include "syscall/internal.h" +#include "syscall/io.h" #include "syscall/mem.h" #include "syscall/proc.h" @@ -249,9 +250,19 @@ int fork_ipc_send_fd_table(int ipc_sock) if (fd_table[i].type == FD_CLOSED) continue; + /* Synthetic-fd types are filtered here; see fd_type_is_synthetic + * in syscall/internal.h for the rationale (kqueue cannot cross + * SCM_RIGHTS on macOS, per-class side tables are not serialized). + * The child sees these slots as FD_CLOSED and recreates them via + * the appropriate syscall.
+ */ + int t = fd_table[i].type; + if (fd_type_is_synthetic(t)) + continue; + int host_fd; bool was_duped = false; - if (fd_table[i].type != FD_STDIO) { + if (t != FD_STDIO) { int duped = dup(fd_table[i].host_fd); if (duped < 0) continue; @@ -315,8 +326,11 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return -1; } - if (num_fds == 0) + if (num_fds == 0) { + for (int fd = 0; fd < 3; fd++) + fd_mark_closed(fd); return 0; + } ipc_fd_entry_t *fd_entries = calloc(num_fds, sizeof(ipc_fd_entry_t)); if (!fd_entries) @@ -328,6 +342,16 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return -1; } + bool low_fd_present[3] = {false, false, false}; + for (uint32_t i = 0; i < num_fds; i++) { + int gfd = fd_entries[i].guest_fd; + if (RANGE_CHECK(gfd, 0, 3) && !fd_type_is_synthetic(fd_entries[i].type)) + low_fd_present[gfd] = true; + } + for (int fd = 0; fd < 3; fd++) + if (!low_fd_present[fd]) + fd_mark_closed(fd); + int *host_fds = calloc(num_fds, sizeof(int)); if (!host_fds) { free(fd_entries); @@ -364,12 +388,30 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path, sizeof(fd_table[gfd].proc_path)); fd_table[gfd].seals = fd_entries[i].seals; + } else if (fd_type_is_synthetic(fd_entries[i].type)) { + /* Defense in depth: the parent's fork_ipc_send_fd_table + * already filters synthetic types out of the SCM_RIGHTS + * payload (see fd_type_is_synthetic in syscall/internal.h). + * If anything still arrives here, drop the inherited host + * fd and leave the slot FD_CLOSED so the child must + * recreate the fd via the appropriate syscall. 
+ */ + log_debug( + "fork-child: dropping unexpected synthetic-type fd %d (type " + "%d)", + gfd, fd_entries[i].type); + close(host_fds[i]); + fd_mark_closed(gfd); + continue; } else { - fd_alloc_at(gfd, fd_entries[i].type, host_fds[i]); + void (*cleanup)(int) = fd_cleanup_for_type(fd_entries[i].type); + fd_alloc_at(gfd, fd_entries[i].type, host_fds[i], cleanup); fd_table[gfd].linux_flags = fd_entries[i].linux_flags; memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path, sizeof(fd_table[gfd].proc_path)); fd_table[gfd].seals = fd_entries[i].seals; + if (fd_entries[i].type == FD_URANDOM) + urandom_fd_reset_cache(gfd); if (fd_entries[i].type != FD_DIR) continue; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 963cb61..59c9ffe 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -1272,7 +1272,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, * * Rosetta guests are excluded from CoW even when shm-backed: rosetta's * JIT state (TLS slabs, code caches, indirect-call tables, block lists) - * is process-local and corrupts when COW-shared. The legacy region-copy + * is process-local and corrupts when CoW-shared. The legacy region-copy * path preserves the parent's JIT state independently per child. */ bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta; diff --git a/src/syscall/abi.h b/src/syscall/abi.h index eda9bc7..122b351 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -639,6 +639,7 @@ typedef struct { #define FD_FUSE_DEV 14 #define FD_FUSE_FILE 15 #define FD_FUSE_DIR 16 +#define FD_URANDOM 17 #define FD_VIRTUAL_PATH_MAX 64 /* File sealing flags (F_SEAL_*) for memfd_create. Tracked per-FD. 
*/ diff --git a/src/syscall/fd.c b/src/syscall/fd.c index c1f828f..f06b0d2 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -104,6 +104,7 @@ void timerfd_init(void) { for (int i = 0; i < TIMERFD_MAX; i++) timerfd_state[i].guest_fd = -1; + fd_register_cleanup(FD_TIMERFD, timerfd_close); } static int timerfd_find(int guest_fd) @@ -514,10 +515,20 @@ static void timerfd_close(int guest_fd) #define LINUX_EFD_NONBLOCK 0x800 /* Same as O_NONBLOCK */ #define LINUX_EFD_SEMAPHORE 1 -/* Per-eventfd state */ +/* Per-eventfd state. The slot is shared across guest_fds that point at it (via + * dup/dup2/fcntl F_DUPFD), matching the Linux contract that dup'd eventfd fds + * share the same kernel object. eventfd_owner[gfd] maps a guest_fd to its slot; + * multiple guest_fds can map to the same slot. The slot owns its own read end + * for readiness/drain/blocking operations so it does not depend on any one + * guest fd remaining open. The slot is freed when refcount drops to zero. The + * slot's guest_fd field is retained for sfd_alloc_slot's + * "free if guest_fd == -1" convention and tracks the most recently allocated + * primary owner. 
+ */ #define EVENTFD_MAX 32 static struct { - int guest_fd; /* Guest fd (-1 if unused) */ + int guest_fd; /* Primary guest fd, -1 when slot is free */ + int refcount; /* Number of guest_fds bound to this slot */ int pipe_rd; /* Read end of self-pipe (for poll/epoll readiness) */ int pipe_wr; /* Write end of self-pipe */ uint64_t counter; /* Accumulated event counter */ @@ -525,16 +536,22 @@ static struct { int nonblock; /* O_NONBLOCK */ } eventfd_state[EVENTFD_MAX]; +static int eventfd_owner[FD_TABLE_SIZE]; /* guest_fd -> slot, or -1 */ + void eventfd_init(void) { for (int i = 0; i < EVENTFD_MAX; i++) eventfd_state[i].guest_fd = -1; + for (int i = 0; i < FD_TABLE_SIZE; i++) + eventfd_owner[i] = -1; + fd_register_cleanup(FD_EVENTFD, eventfd_close); } static int eventfd_find(int guest_fd) { - return sfd_find_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]), - guest_fd); + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return -1; + return eventfd_owner[guest_fd]; } static int eventfd_slot_alloc(void) @@ -542,6 +559,19 @@ static int eventfd_slot_alloc(void) return sfd_alloc_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0])); } +static void eventfd_release_ref_locked(int slot) +{ + if (--eventfd_state[slot].refcount <= 0) { + close(eventfd_state[slot].pipe_rd); + close(eventfd_state[slot].pipe_wr); + eventfd_state[slot].guest_fd = -1; + eventfd_state[slot].counter = 0; + eventfd_state[slot].refcount = 0; + eventfd_state[slot].pipe_rd = -1; + eventfd_state[slot].pipe_wr = -1; + } +} + int64_t sys_eventfd2(unsigned int initval, int flags) { if (flags & ~(LINUX_EFD_CLOEXEC | LINUX_EFD_NONBLOCK | LINUX_EFD_SEMAPHORE)) @@ -564,9 +594,22 @@ int64_t sys_eventfd2(unsigned int initval, int flags) return linux_errno(); } + int state_rd = dup(pipefd[0]); + if (state_rd < 0 || fd_set_nonblock(state_rd) < 0 || + fd_set_cloexec(state_rd) < 0) { + int saved_errno = errno; + if (state_rd >= 0) + close(state_rd); + close(pipefd[0]); + close(pipefd[1]); + errno = 
saved_errno; + return linux_errno(); + } + /* Allocate guest fd: use read end as the host fd so epoll/poll sees it */ int gfd = fd_alloc(FD_EVENTFD, pipefd[0], eventfd_close); if (gfd < 0) { + close(state_rd); close(pipefd[0]); close(pipefd[1]); return -LINUX_EMFILE; @@ -577,17 +620,20 @@ int64_t sys_eventfd2(unsigned int initval, int flags) if (slot < 0) { pthread_mutex_unlock(&sfd_lock); fd_mark_closed(gfd); + close(state_rd); close(pipefd[0]); close(pipefd[1]); return -LINUX_ENOMEM; } eventfd_state[slot].guest_fd = gfd; - eventfd_state[slot].pipe_rd = pipefd[0]; + eventfd_state[slot].refcount = 1; + eventfd_state[slot].pipe_rd = state_rd; eventfd_state[slot].pipe_wr = pipefd[1]; eventfd_state[slot].counter = (uint64_t) initval; eventfd_state[slot].semaphore = (flags & LINUX_EFD_SEMAPHORE) ? 1 : 0; eventfd_state[slot].nonblock = (flags & LINUX_EFD_NONBLOCK) ? 1 : 0; + eventfd_owner[gfd] = slot; pthread_mutex_unlock(&sfd_lock); fd_table[gfd].linux_flags = @@ -610,14 +656,117 @@ static void eventfd_close(int guest_fd) pthread_mutex_lock(&sfd_lock); int slot = eventfd_find(guest_fd); if (slot >= 0) { - close(eventfd_state[slot].pipe_wr); - /* pipe_rd is closed by sys_close() as host_fd */ - eventfd_state[slot].guest_fd = -1; - eventfd_state[slot].counter = 0; + eventfd_owner[guest_fd] = -1; + eventfd_release_ref_locked(slot); } pthread_mutex_unlock(&sfd_lock); } +/* Bind an additional guest_fd to the same slot as src_fd, sharing the + * counter and pipe state. Two races to defeat: + * + * - Source identity. duplicate_guest_fd() snapshots src_fd under + * fd_lock, releases it, then calls us. Between those points src_fd + * could be closed and rebound to a different eventfd. We carry the + * caller's snapshot of fd_table[src_fd].host_fd as src_host_fd and verify + * under fd_lock + sfd_lock that the source fd still has that host fd and + * still maps to a live eventfd slot. + * + * - Destination close. 
fd_alloc_*_relaxed publishes the new guest_fd + * with eventfd_close as cleanup before we install the owner mapping. + * A racing close would run eventfd_close, see owner == -1, skip the + * refcount decrement, and leak the slot. We defeat this by reserving a + * slot ref before publishing the destination, then holding fd_lock + + * sfd_lock together while we verify fd_table[new] is still FD_EVENTFD with + * the host_fd we allocated and set eventfd_owner. Any close that already + * ran is observed here as FD_CLOSED, and we abandon the bind cleanly with + * no leak. + */ +int eventfd_dup_fd(int src_fd, + int src_host_fd, + int min_guest_fd, + int fixed_guest_fd, + bool fixed_slot, + int linux_flags) +{ + /* Pin the source under fd_lock + sfd_lock and dup the slot-owned + * readiness fd. The slot fd is independent of any guest alias, so closing + * the source later cannot invalidate eventfd_state[slot].pipe_rd. + */ + pthread_mutex_lock(&fd_lock); + pthread_mutex_lock(&sfd_lock); + int slot = eventfd_find(src_fd); + if (slot < 0 || fd_table[src_fd].type != FD_EVENTFD || + fd_table[src_fd].host_fd != src_host_fd || + eventfd_state[slot].refcount <= 0) { + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + errno = EBADF; + return -1; + } + eventfd_state[slot].refcount++; + int new_host_fd = dup(eventfd_state[slot].pipe_rd); + int original_pipe_rd = eventfd_state[slot].pipe_rd; + if (new_host_fd < 0) + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + if (new_host_fd < 0) + return -1; + + /* Publish the destination fd with eventfd_close as cleanup. The + * eventfd_owner mapping is still -1, so a racing close here observes + * owner == -1 and does nothing; we detect that below. + */ + int new_guest_fd = fixed_slot + ? 
? fd_alloc_at_relaxed(fixed_guest_fd, FD_EVENTFD, + new_host_fd, eventfd_close) + : fd_alloc_from_relaxed(min_guest_fd, FD_EVENTFD, + new_host_fd, eventfd_close); + if (new_guest_fd < 0) { + close(new_host_fd); + pthread_mutex_lock(&sfd_lock); + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + if (fixed_slot) + errno = EBADF; + return -1; + } + + /* Commit the bind under both locks in the documented order + * (fd_lock then sfd_lock). If a close already ran, fd_table[new].type + * is FD_CLOSED and we bail with -1 and errno EBADF; the host_fd is already + * gone via sys_close. Otherwise verify the source slot is still + * alive and unchanged, then install owner for the reserved ref. + */ + pthread_mutex_lock(&fd_lock); + pthread_mutex_lock(&sfd_lock); + if (fd_table[new_guest_fd].type != FD_EVENTFD || + fd_table[new_guest_fd].host_fd != new_host_fd || + eventfd_state[slot].refcount <= 0 || + eventfd_state[slot].pipe_rd != original_pipe_rd) { + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + /* If the destination is still open but the source went away, tear + * it down; if it already closed itself the snapshot sees FD_CLOSED + * (no-op). NOTE(review): a close-and-rebind of it looks unhandled. + */ + fd_entry_t snap; + if (fd_snapshot_and_close(new_guest_fd, &snap)) + fd_cleanup_entry(new_guest_fd, &snap); + pthread_mutex_lock(&sfd_lock); + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + errno = EBADF; + return -1; + } + eventfd_owner[new_guest_fd] = slot; + fd_table[new_guest_fd].linux_flags = linux_flags; + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + return new_guest_fd; +} + /* Read from eventfd: return 8-byte counter value, then reset to 0. * In EFD_SEMAPHORE mode, return 1 and decrement counter by 1.
*/ @@ -657,8 +806,12 @@ int64_t eventfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) return linux_errno(); pthread_mutex_lock(&sfd_lock); - /* Re-validate: slot may have been freed by eventfd_close() */ - if (eventfd_state[slot].guest_fd != guest_fd) { + /* Re-validate via the owner table, not eventfd_state[slot].guest_fd: + * dup'd aliases bind multiple guest_fds to the same slot, so a + * legitimate caller's guest_fd may not equal the primary owner. + */ + if (eventfd_owner[guest_fd] != slot || + eventfd_state[slot].refcount <= 0) { pthread_mutex_unlock(&sfd_lock); return -LINUX_EBADF; } @@ -809,6 +962,7 @@ void signalfd_init(void) { for (int i = 0; i < SIGNALFD_MAX; i++) signalfd_state[i].guest_fd = -1; + fd_register_cleanup(FD_SIGNALFD, signalfd_close); } static int signalfd_find(int guest_fd) diff --git a/src/syscall/fd.h b/src/syscall/fd.h index e087ed4..faaf958 100644 --- a/src/syscall/fd.h +++ b/src/syscall/fd.h @@ -33,6 +33,21 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva); /* eventfd (emulated via pipe + counter) */ int64_t sys_eventfd2(unsigned int initval, int flags); +/* Duplicate an eventfd into a new guest_fd slot, sharing the counter and + * pipe state with src_fd. Mirrors the Linux contract that dup'd eventfds + * share the same underlying kernel object. src_host_fd must be the host + * fd snapshotted from fd_table[src_fd].host_fd by the caller; the + * implementation uses it to verify under fd_lock + sfd_lock that the source + * fd still refers to the same live eventfd between the caller's snapshot and + * the dup commit. Returns the new guest_fd or -1 with errno set. 
+ */ +int eventfd_dup_fd(int src_fd, + int src_host_fd, + int min_guest_fd, + int fixed_guest_fd, + bool fixed_slot, + int linux_flags); + /* signalfd (emulated via synthetic signal reads) */ int64_t sys_signalfd4(guest_t *g, int fd, diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index 5455f41..9c388c4 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -169,26 +169,29 @@ int fd_alloc(int type, int host_fd, void (*cleanup)(int)) /* Allocate the lowest available FD >= minfd. Returns -1 if none available * or RLIMIT_NOFILE would be exceeded. */ -int fd_alloc_from(int minfd, int type, int host_fd) +int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int)) { pthread_mutex_lock(&fd_lock); - int fd = fd_alloc_locked(minfd, type, host_fd, NULL); + int fd = fd_alloc_locked(minfd, type, host_fd, cleanup); pthread_mutex_unlock(&fd_lock); return fd; } -int fd_alloc_from_relaxed(int minfd, int type, int host_fd) +int fd_alloc_from_relaxed(int minfd, + int type, + int host_fd, + void (*cleanup)(int)) { if (!thread_is_single_active()) - return fd_alloc_from(minfd, type, host_fd); - return fd_alloc_locked(minfd, type, host_fd, NULL); + return fd_alloc_from(minfd, type, host_fd, cleanup); + return fd_alloc_locked(minfd, type, host_fd, cleanup); } /* Allocate a specific FD slot. Enforces RLIMIT_NOFILE. Properly cleans up any * existing entry (including DIR* for directory FDs) before overwriting. Returns * -1 if out of range. 
*/ -int fd_alloc_at(int fd, int type, int host_fd) +int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int)) { if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -1; @@ -204,7 +207,7 @@ int fd_alloc_at(int fd, int type, int host_fd) pthread_mutex_lock(&fd_lock); if (fd_table[fd].type != FD_CLOSED) old = fd_table[fd]; - fd_init_entry(fd, type, host_fd, NULL); + fd_init_entry(fd, type, host_fd, cleanup); pthread_mutex_unlock(&fd_lock); /* Clean up old resources outside fd_lock */ @@ -214,19 +217,19 @@ int fd_alloc_at(int fd, int type, int host_fd) return fd; } -int fd_alloc_at_relaxed(int fd, int type, int host_fd) +int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int)) { if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -1; if (fd >= rlimit_nofile_cur) return -1; if (!thread_is_single_active()) - return fd_alloc_at(fd, type, host_fd); + return fd_alloc_at(fd, type, host_fd, cleanup); if (fd_table[fd].type != FD_CLOSED) - return fd_alloc_at(fd, type, host_fd); + return fd_alloc_at(fd, type, host_fd, cleanup); - fd_init_entry(fd, type, host_fd, NULL); + fd_init_entry(fd, type, host_fd, cleanup); return fd; } @@ -334,6 +337,53 @@ bool fd_snapshot(int guest_fd, fd_entry_t *out) return ok; } +int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out) +{ + out->type = FD_CLOSED; + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return -1; + pthread_mutex_lock(&fd_lock); + if (!fd_snapshot_locked(guest_fd, out, false)) { + pthread_mutex_unlock(&fd_lock); + return -1; + } + int host = (out->host_fd >= 0) ? dup(out->host_fd) : -1; + pthread_mutex_unlock(&fd_lock); + return host; +} + +int fd_get_type(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return FD_CLOSED; + pthread_mutex_lock(&fd_lock); + int type = fd_table[guest_fd].type; + pthread_mutex_unlock(&fd_lock); + return type; +} + +/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed + * by type. Each slot defaults to NULL (no per-type cleanup). 
Modules that + * own a type call fd_register_cleanup() at init time; dup and fork-restore + * paths read back the binding via fd_cleanup_for_type(). + */ +#define FD_TYPE_REGISTRY_SIZE 32 +static void (*fd_type_cleanup[FD_TYPE_REGISTRY_SIZE])(int); + +void fd_register_cleanup(int type, void (*cleanup)(int)) +{ + if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE) + return; + fd_type_cleanup[type] = cleanup; +} + +void (*fd_cleanup_for_type(int type))(int) +{ + if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE) + return NULL; + return fd_type_cleanup[type]; +} + /* Look up a guest FD and return a dup'd host fd that the caller owns. * The dup is performed under fd_lock so that close() on another thread * cannot invalidate the host fd between lookup and dup. Caller must diff --git a/src/syscall/fs.c b/src/syscall/fs.c index ce951eb..426c6df 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -27,6 +27,7 @@ #include "runtime/procemu.h" #include "syscall/abi.h" +#include "syscall/fd.h" /* eventfd_dup_fd */ #include "syscall/fuse.h" #include "syscall/fs.h" #include "syscall/internal.h" @@ -62,6 +63,16 @@ static int opened_fd_type(int host_fd, int linux_flags) return FD_REGULAR; } +static int intercepted_fd_type(const char *path, int host_fd, int linux_flags) +{ + int type = opened_fd_type(host_fd, linux_flags); + if (type < 0) + return type; + if (type == FD_REGULAR && path && !strcmp(path, "/dev/urandom")) + return FD_URANDOM; + return type; +} + static const char *proc_virtual_dir_path(const char *path, char *buf, size_t bufsz); @@ -168,16 +179,11 @@ static const char *proc_virtual_dir_path(const char *path, return virt; } -static int dup_fd_type(int guest_fd) -{ - return fd_table[guest_fd].type == FD_STDIO ? 
FD_REGULAR - : fd_table[guest_fd].type; -} - static int fd_alloc_opened_host(int host_fd, int type, int linux_flags, - int min_guest_fd) + int min_guest_fd, + void (*cleanup)(int)) { DIR *dir = NULL; @@ -193,9 +199,10 @@ static int fd_alloc_opened_host(int host_fd, } } - int guest_fd = min_guest_fd >= 0 - ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd) - : fd_alloc_from_relaxed(0, type, host_fd); + int guest_fd = + min_guest_fd >= 0 + ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd, cleanup) + : fd_alloc_from_relaxed(0, type, host_fd, cleanup); if (guest_fd < 0) { int saved_errno = errno; if (dir) @@ -249,7 +256,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1); + fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(sidecar_fd); return linux_errno(); @@ -278,7 +285,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -303,15 +311,17 @@ int64_t sys_openat_path(guest_t *g, * /proc files use fd_alloc_from(128) to avoid races with * concurrent GC finalizers that may close stale low-numbered fds. */ - int type = opened_fd_type(intercepted, linux_flags); + int type = intercepted_fd_type(tx.intercept_path, intercepted, + linux_flags); if (type < 0) { close_keep_errno(intercepted); return linux_errno(); } int min_guest_fd = (!strncmp(tx.intercept_path, "/dev/", 5)) ? 
-1 : 128; - int guest_fd = fd_alloc_opened_host(intercepted, type, linux_flags, - min_guest_fd); + int guest_fd = + fd_alloc_opened_host(intercepted, type, linux_flags, + min_guest_fd, fd_cleanup_for_type(type)); if (guest_fd < 0) { close_keep_errno(intercepted); return linux_errno(); @@ -336,7 +346,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -358,7 +369,7 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -436,14 +447,16 @@ static void discard_allocated_fd(int guest_fd) fd_cleanup_entry(guest_fd, &snap); } -static void copy_fd_alias_metadata(int src_fd, int dst_fd, int linux_flags) +static void install_fd_alias_metadata(int dst_fd, + const fd_entry_t *src_snap, + int linux_flags) { - int preserved_flags = fd_table[src_fd].linux_flags & + int preserved_flags = src_snap->linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); fd_table[dst_fd].linux_flags = preserved_flags | linux_flags; - fd_table[dst_fd].seals = fd_table[src_fd].seals; - memcpy(fd_table[dst_fd].proc_path, fd_table[src_fd].proc_path, + fd_table[dst_fd].seals = src_snap->seals; + memcpy(fd_table[dst_fd].proc_path, src_snap->proc_path, sizeof(fd_table[dst_fd].proc_path)); } @@ -457,28 +470,44 @@ static int duplicate_guest_fd(int src_fd, bool fixed_slot, int linux_flags) { - if (RANGE_CHECK(src_fd, 0, FD_TABLE_SIZE)) { - int t = fd_table[src_fd].type; - if (t == FD_FUSE_DEV || t == FD_FUSE_FILE || t == FD_FUSE_DIR) - return 
fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, - linux_flags); - } - - host_fd_ref_t host_ref; - if (host_fd_ref_open(src_fd, &host_ref) < 0) { + /* Snapshot the source entry and dup its host fd in a single fd_lock + * critical section so the type, host fd, and metadata captured here + * cannot drift apart under a racing close + reopen. + */ + fd_entry_t src_snap; + int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap); + if (new_host_fd < 0 && src_snap.type == FD_CLOSED) { errno = EBADF; return -1; } - - int new_type = dup_fd_type(src_fd); - int new_host_fd = dup(host_ref.fd); - host_fd_ref_close(&host_ref); + if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE || + src_snap.type == FD_FUSE_DIR) { + if (new_host_fd >= 0) + close_keep_errno(new_host_fd); + return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, + linux_flags); + } + /* eventfd dup must share the underlying counter and pipe state across + * the source and destination fds (Linux contract). Pass src_snap's + * host_fd through so eventfd_dup_fd can verify the source fd still + * refers to the same live eventfd between the snapshot here and the + * bind there. + */ + if (src_snap.type == FD_EVENTFD) { + if (new_host_fd >= 0) + close_keep_errno(new_host_fd); + return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd, + fixed_guest_fd, fixed_slot, linux_flags); + } if (new_host_fd < 0) return -1; - int guest_fd = - fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type, new_host_fd) - : fd_alloc_from_relaxed(min_guest_fd, new_type, new_host_fd); + int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type; + void (*cleanup)(int) = fd_cleanup_for_type(new_type); + int guest_fd = fixed_slot ? 
fd_alloc_at_relaxed(fixed_guest_fd, new_type, + new_host_fd, cleanup) + : fd_alloc_from_relaxed(min_guest_fd, new_type, + new_host_fd, cleanup); if (guest_fd < 0) { if (fixed_slot) errno = EBADF; @@ -486,7 +515,7 @@ static int duplicate_guest_fd(int src_fd, return -1; } - copy_fd_alias_metadata(src_fd, guest_fd, linux_flags); + install_fd_alias_metadata(guest_fd, &src_snap, linux_flags); if (clone_dir_stream_if_needed(src_fd, guest_fd, new_host_fd) < 0) { int saved_errno = errno; discard_allocated_fd(guest_fd); @@ -600,7 +629,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return linux_errno(); int linux_fl = mac_to_linux_status_flags(mac_fl); if (snap.type == FD_REGULAR || snap.type == FD_DIR || - snap.type == FD_PATH) + snap.type == FD_PATH || snap.type == FD_URANDOM) linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3); linux_fl |= snap.linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index ae248e1..157191a 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -1281,6 +1281,9 @@ void fuse_init(void) memset(fuse_file_bindings, 0, sizeof(fuse_file_bindings)); fuse_next_mount_id = 100; pthread_mutex_unlock(&fuse_lock); + fd_register_cleanup(FD_FUSE_DEV, fuse_fd_cleanup); + fd_register_cleanup(FD_FUSE_FILE, fuse_fd_cleanup); + fd_register_cleanup(FD_FUSE_DIR, fuse_fd_cleanup); } int fuse_proc_open(int linux_flags) @@ -2540,9 +2543,15 @@ int fuse_dup_fd(int src_fd, return -1; } - int guest_fd = fixed_slot - ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type, -1) - : fd_alloc_from_relaxed(min_guest_fd, snap.type, -1); + /* Install cleanup atomically with the type. Without this, a racing + * close between fd_alloc_*_relaxed publishing the slot and the later + * fd_table[guest_fd].cleanup assignment would skip fuse_fd_cleanup + * and leak the session or file ref. + */ + int guest_fd = fixed_slot ? 
fd_alloc_at_relaxed(fixed_guest_fd, snap.type, + -1, fuse_fd_cleanup) + : fd_alloc_from_relaxed(min_guest_fd, snap.type, + -1, fuse_fd_cleanup); if (guest_fd < 0) { if (fixed_slot) errno = EBADF; @@ -2588,7 +2597,6 @@ int fuse_dup_fd(int src_fd, (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; - fd_table[guest_fd].cleanup = fuse_fd_cleanup; pthread_mutex_unlock(&fuse_lock); return guest_fd; } diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 7513e5c..d9b54dd 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -111,6 +111,7 @@ void inotify_init(void) { for (int i = 0; i < INOTIFY_MAX; i++) inotify_state[i].guest_fd = -1; + fd_register_cleanup(FD_INOTIFY, inotify_close); } static int inotify_find(int guest_fd) diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 2760ce9..ca38b62 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -59,32 +59,78 @@ void fdtable_init(void); */ int fd_alloc(int type, int host_fd, void (*cleanup)(int)); -/* Allocate the lowest available FD >= minfd. Returns -1 if none available. */ -int fd_alloc_from(int minfd, int type, int host_fd); +/* Allocate the lowest available FD >= minfd. Returns -1 if none available. + * cleanup is set atomically under fd_lock (pass NULL for plain fds). + */ +int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int)); /* Allocate the lowest available FD >= minfd with a single-thread fast path. * Falls back to fd_alloc_from() when multiple guest threads are active. */ -int fd_alloc_from_relaxed(int minfd, int type, int host_fd); +int fd_alloc_from_relaxed(int minfd, + int type, + int host_fd, + void (*cleanup)(int)); -/* Allocate a specific FD slot. Returns -1 if out of range. */ -int fd_alloc_at(int fd, int type, int host_fd); +/* Allocate a specific FD slot. + * Returns -1 if out of range. 
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds). + */ +int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int)); /* Allocate a specific FD slot with a single-thread fast path. * Falls back to fd_alloc_at() when replacement/cleanup must stay serialized. */ -int fd_alloc_at_relaxed(int fd, int type, int host_fd); +int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int)); /* Look up a guest FD. Returns host FD or -1 if invalid. * Unsafe for concurrent use; see fd_snapshot/fd_to_host_dup. */ int fd_to_host(int guest_fd); -/* Snapshot an fd entry under fd_lock. Thread-safe alternative to - * direct fd_table[] access. Returns true on success, false if closed. +/* Snapshot an fd entry under fd_lock. Thread-safe alternative to direct + * fd_table[] access. + * Returns true on success, false if closed. */ bool fd_snapshot(int guest_fd, fd_entry_t *out); +/* Snapshot an fd entry AND dup its host fd in a single fd_lock critical + * section. Eliminates the TOCTOU window between reading the type/metadata + * and duplicating the host fd in the dup(2) path. Returns the dup'd host + * fd (owned by the caller) on success, -1 on failure. On success the + * snapshot in *out is consistent with the dup'd host fd. + */ +int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out); + +/* Read just the fd type under fd_lock. Returns FD_CLOSED for out-of-range or + * closed slots. Cheaper than fd_snapshot when only the type is needed for + * dispatch (sys_read/sys_readv/sys_writev fast paths). + */ +int fd_get_type(int guest_fd); + +/* Type -> cleanup registry. Modules that own a synthetic fd type register + * their cleanup at init time; dup and fork-restore paths look up the + * cleanup from the type so the binding stays consistent without each path + * re-deriving the dispatch table. 
+ */ +void fd_register_cleanup(int type, void (*cleanup)(int)); +void (*fd_cleanup_for_type(int type))(int); + +/* True for fd types whose host backing (kqueue for timerfd/inotify, pipe + * halves for eventfd/signalfd/netlink/pidfd, epoll instance) cannot be + * meaningfully inherited across fork IPC: macOS SCM_RIGHTS rejects kqueue + * fds, and the per-class side-table state (eventfd counter, signalfd mask, + * pidfd target, epoll set, ...) is not serialized. The child must recreate + * such fds via the appropriate syscall, so the parent filters them from the + * SCM_RIGHTS payload and the receiver drops any that still arrive. + */ +static inline bool fd_type_is_synthetic(int type) +{ + return type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || + type == FD_INOTIFY || type == FD_NETLINK || type == FD_PIDFD || + type == FD_EPOLL; +} + /* Look up a guest FD and return a dup'd host fd owned by the caller. * Thread-safe: dup is performed under fd_lock. Returns -1 on failure. * Caller MUST close() the returned fd when done. diff --git a/src/syscall/io.c b/src/syscall/io.c index ee183dd..f901ba7 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #define SYSCALL_IOV_MAX 1024 #define SYSCALL_IOV_STACK_MAX 64 +#define URANDOM_CACHE_SIZE 4096 /* Linux terminal struct types. 
*/ @@ -60,6 +62,15 @@ typedef struct { uint8_t c_cc[19]; } linux_termios_t; +typedef struct { + uint8_t buf[URANDOM_CACHE_SIZE]; + size_t off; + size_t len; +} urandom_cache_t; + +static pthread_mutex_t urandom_lock = PTHREAD_MUTEX_INITIALIZER; +static urandom_cache_t urandom_cache[FD_TABLE_SIZE]; + _Static_assert(sizeof(linux_termios_t) == 36, "aarch64 Linux TCGETS struct termios must be 36 bytes"); @@ -123,6 +134,120 @@ static int64_t io_return_zero(host_fd_ref_t *host_ref) return 0; } +void urandom_fd_reset_cache(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + + pthread_mutex_lock(&urandom_lock); + memset(&urandom_cache[guest_fd], 0, sizeof(urandom_cache[guest_fd])); + pthread_mutex_unlock(&urandom_lock); +} + +void urandom_fd_cleanup(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + + urandom_fd_reset_cache(guest_fd); +} + +static int64_t urandom_check_readable(int guest_fd) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap) || snap.type != FD_URANDOM) + return -LINUX_EBADF; + if ((snap.linux_flags & 3) == LINUX_O_WRONLY) + return -LINUX_EBADF; + return 0; +} + +static int64_t urandom_fill_iov(int guest_fd, + const struct iovec *iov, + int iovcnt) +{ + int64_t err = urandom_check_readable(guest_fd); + if (err < 0) + return err; + + size_t total = 0; + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > (size_t) SSIZE_MAX - total) + return -LINUX_EINVAL; + total += iov[i].iov_len; + } + if (total == 0) + return 0; + + pthread_mutex_lock(&urandom_lock); + urandom_cache_t *c = &urandom_cache[guest_fd]; + size_t done = 0; + for (int i = 0; i < iovcnt && done < total; i++) { + uint8_t *dst = iov[i].iov_base; + size_t iov_done = 0; + size_t iov_len = iov[i].iov_len; + if (iov_len > total - done) + iov_len = total - done; + while (iov_done < iov_len) { + if (c->off == c->len) { + arc4random_buf(c->buf, sizeof(c->buf)); + c->off = 0; + c->len = sizeof(c->buf); + } + size_t chunk = c->len - c->off; + 
if (chunk > iov_len - iov_done) + chunk = iov_len - iov_done; + memcpy(dst + iov_done, c->buf + c->off, chunk); + c->off += chunk; + iov_done += chunk; + done += chunk; + } + } + pthread_mutex_unlock(&urandom_lock); + return (int64_t) done; +} + +static int64_t validate_iov_total(guest_t *g, uint64_t iov_gva, int iovcnt) +{ + if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX) + return -LINUX_EINVAL; + + size_t total = 0; + for (int i = 0; i < iovcnt; i++) { + linux_iovec_t giov; + if (guest_read_small(g, iov_gva + (uint64_t) i * sizeof(giov), &giov, + sizeof(giov)) < 0) + return -LINUX_EFAULT; + if (giov.iov_len > (uint64_t) SSIZE_MAX - total) + return -LINUX_EINVAL; + total += (size_t) giov.iov_len; + } + return 0; +} + +static int64_t urandom_read(guest_t *g, + int guest_fd, + uint64_t buf_gva, + uint64_t count) +{ + if (count > SSIZE_MAX) + count = SSIZE_MAX; + if (count == 0) { + struct iovec empty = {0}; + return urandom_fill_iov(guest_fd, &empty, 1); + } + + uint64_t avail = 0; + void *dst = guest_ptr_bound(g, buf_gva, &avail, MEM_PERM_W, count); + if (!dst) + return -LINUX_EFAULT; + if (count > avail) + count = avail; + + struct iovec iov = {.iov_base = dst, .iov_len = (size_t) count}; + return urandom_fill_iov(guest_fd, &iov, 1); +} + static bool rosetta_ioctl_target_fd(guest_t *g, int host_fd) { if (!g->is_rosetta) @@ -689,12 +814,11 @@ static int64_t io_write_result(ssize_t ret) int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - if (fd_table[fd].type == FD_FUSE_DEV) - return fuse_dev_write(g, fd, buf_gva, count); - if (fd_table[fd].type == FD_EVENTFD) - return eventfd_write(fd, g, buf_gva, count); - } + int type = fd_get_type(fd); + if (type == FD_FUSE_DEV) + return fuse_dev_write(g, fd, buf_gva, count); + if (type == FD_EVENTFD) + return eventfd_write(fd, g, buf_gva, count); host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_checked(fd, &host_ref, true); @@ -741,21 +865,28 @@ int64_t 
sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - if (fd_table[fd].type == FD_FUSE_DEV) - return fuse_dev_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_FUSE_FILE) - return fuse_read_fd(g, fd, buf_gva, count); - if (fd_table[fd].type == FD_EVENTFD) - return eventfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_SIGNALFD) - return signalfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_TIMERFD) - return timerfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_INOTIFY) - return inotify_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_NETLINK) - return netlink_read(fd, g, buf_gva, count); + /* Read the type once under fd_lock so a concurrent close/reopen cannot + * make different dispatch checks disagree. Each handler still + * re-validates internally and returns EBADF if its slot changed. + */ + int type = fd_get_type(fd); + switch (type) { + case FD_FUSE_DEV: + return fuse_dev_read(fd, g, buf_gva, count); + case FD_FUSE_FILE: + return fuse_read_fd(g, fd, buf_gva, count); + case FD_EVENTFD: + return eventfd_read(fd, g, buf_gva, count); + case FD_SIGNALFD: + return signalfd_read(fd, g, buf_gva, count); + case FD_TIMERFD: + return timerfd_read(fd, g, buf_gva, count); + case FD_INOTIFY: + return inotify_read(fd, g, buf_gva, count); + case FD_NETLINK: + return netlink_read(fd, g, buf_gva, count); + case FD_URANDOM: + return urandom_read(g, fd, buf_gva, count); } host_fd_ref_t host_ref; @@ -914,11 +1045,23 @@ static int64_t build_host_iov(guest_t *g, free(guest_iov); return -LINUX_EFAULT; } - /* Cap to contiguous permitted bytes */ + /* Cap to contiguous permitted bytes. 
When the guest iov entry + * spans a non-contiguous boundary (different mapping or + * permission), zero every subsequent host iov length so the + * host readv/writev returns a POSIX-compliant short I/O rather + * than silently packing the truncated tail of buffer i into + * buffer i+1 -- which corrupts the guest's data layout. + */ uint64_t len = guest_iov[i].iov_len; - if (len > avail) - len = avail; host_iov[i].iov_base = base; + if (len > avail) { + host_iov[i].iov_len = avail; + for (int j = i + 1; j < iovcnt; j++) { + host_iov[j].iov_base = NULL; + host_iov[j].iov_len = 0; + } + break; + } host_iov[i].iov_len = len; } if (guest_iov != stack_giov) @@ -981,29 +1124,49 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) int64_t err = single_guest_iov(g, iov_gva, &giov); if (err < 0) return err; + if (fd_get_type(fd) == FD_URANDOM && + giov.iov_len > (uint64_t) SSIZE_MAX) { + err = urandom_check_readable(fd); + if (err < 0) + return err; + return -LINUX_EINVAL; + } return sys_read(g, fd, giov.iov_base, giov.iov_len); } /* Special FD types need their custom read handlers because glibc may use * readv() instead of read() for the same logical operation. Delegate - * to the first iov entry's buffer. Use the first iov's length (not - * the sum of all iovs) because the data goes into giov[0].iov_base - * which is only giov[0].iov_len bytes long. + * scalar special fds to the first iov entry's buffer. Use the first iov's + * length (not the sum of all iovs) because the data goes into + * giov[0].iov_base which is only giov[0].iov_len bytes long. */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - int type = fd_table[fd].type; - if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || - type == FD_INOTIFY) { - if (iovcnt <= 0) - return -LINUX_EINVAL; - /* Use guest_read for the iov array since guest_ptr alone is unsafe - * if the array spans a 2MiB block boundary. 
- */ - linux_iovec_t giov; - if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0) - return -LINUX_EFAULT; - return sys_read(g, fd, giov.iov_base, giov.iov_len); - } + int type = fd_get_type(fd); + if (type == FD_URANDOM) { + int64_t err = urandom_check_readable(fd); + if (err < 0) + return err; + err = validate_iov_total(g, iov_gva, iovcnt); + if (err < 0) + return err; + host_iov_buf_t host_iov; + err = host_iov_prepare(g, iov_gva, iovcnt, MEM_PERM_W, &host_iov); + if (err < 0) + return err; + int64_t ret = urandom_fill_iov(fd, host_iov.iov, iovcnt); + host_iov_free(&host_iov); + return ret; + } + if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || + type == FD_INOTIFY) { + if (iovcnt <= 0) + return -LINUX_EINVAL; + /* Use guest_read for the iov array since guest_ptr alone is unsafe + * if the array spans a 2MiB block boundary. + */ + linux_iovec_t giov; + if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0) + return -LINUX_EFAULT; + return sys_read(g, fd, giov.iov_base, giov.iov_len); } host_fd_ref_t host_ref; @@ -1051,7 +1214,7 @@ int64_t sys_writev(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) * sum of all iovs) because the data is at giov.iov_base which is only * giov.iov_len bytes. eventfd expects exactly 8 bytes. */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_EVENTFD) { + if (fd_get_type(fd) == FD_EVENTFD) { if (iovcnt <= 0) return -LINUX_EINVAL; linux_iovec_t giov; diff --git a/src/syscall/io.h b/src/syscall/io.h index 05a3321..399b551 100644 --- a/src/syscall/io.h +++ b/src/syscall/io.h @@ -22,6 +22,8 @@ /* read/write and their positional variants. 
*/ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); +void urandom_fd_cleanup(int guest_fd); +void urandom_fd_reset_cache(int guest_fd); int64_t sys_pread64(guest_t *g, int fd, uint64_t buf_gva, diff --git a/src/syscall/net-msg.c b/src/syscall/net-msg.c index ecc9f71..96221ff 100644 --- a/src/syscall/net-msg.c +++ b/src/syscall/net-msg.c @@ -98,7 +98,7 @@ static void recvmsg_close_host_rights(const void *data_src, size_t data_len) int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_sendmsg(fd, g, msg_gva, linux_flags); host_fd_ref_t host_ref; @@ -339,7 +339,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_recvmsg(fd, g, msg_gva, flags); host_fd_ref_t host_ref; diff --git a/src/syscall/net.c b/src/syscall/net.c index b80ca18..05b0c76 100644 --- a/src/syscall/net.c +++ b/src/syscall/net.c @@ -215,7 +215,7 @@ int64_t sys_socketpair(guest_t *g, int64_t sys_bind(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen) { /* Netlink sockets use synthetic fd; dispatch to netlink handler */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_bind(fd, g, addr_gva, addrlen); host_fd_ref_t host_ref; @@ -469,7 +469,7 @@ int64_t sys_connect(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen) return linux_errno(); } - if (fd_alloc_at(fd, FD_SOCKET, pair[0]) < 0) { + if (fd_alloc_at(fd, FD_SOCKET, pair[0], NULL) < 0) { close(pair[0]); close(pair[1]); host_fd_ref_close(&host_ref); diff --git a/src/syscall/netlink.c 
b/src/syscall/netlink.c index a1b555e..32c3ec3 100644 --- a/src/syscall/netlink.c +++ b/src/syscall/netlink.c @@ -396,6 +396,7 @@ static int nl_build_getaddr(netlink_state_t *ns) void netlink_init(void) { memset(nl_state, 0, sizeof(nl_state)); + fd_register_cleanup(FD_NETLINK, netlink_close); } int64_t netlink_socket(int protocol, int type) diff --git a/src/syscall/proc-pidfd.c b/src/syscall/proc-pidfd.c index 62480f3..635eb88 100644 --- a/src/syscall/proc-pidfd.c +++ b/src/syscall/proc-pidfd.c @@ -50,6 +50,13 @@ static pidfd_entry_t *pidfd_find_guest_fd_entry(int guest_fd) return NULL; } +static void pidfd_cleanup(int guest_fd); + +void pidfd_init(void) +{ + fd_register_cleanup(FD_PIDFD, pidfd_cleanup); +} + static void pidfd_cleanup(int guest_fd) { pthread_mutex_lock(&pidfd_lock); diff --git a/src/syscall/proc-pidfd.h b/src/syscall/proc-pidfd.h index 8d02df4..79e55e5 100644 --- a/src/syscall/proc-pidfd.h +++ b/src/syscall/proc-pidfd.h @@ -10,6 +10,7 @@ #include "core/guest.h" +void pidfd_init(void); int pidfd_create(guest_t *g, int64_t target_pid); void proc_pidfd_notify_exit(int64_t exited_pid); int64_t proc_pidfd_lookup_pid(int guest_fd); diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 2156638..2ac58be 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -1447,7 +1447,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) * glibc leaves sa_restorer uninitialized (garbage); musl sets it to * __restore_rt. Match the kernel: always use the vDSO trampoline. 
*/ - hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_TEXT); + hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_SIGRET); if (act->sa_flags & LINUX_SA_SIGINFO) { /* X1 = pointer to siginfo, X2 = pointer to ucontext */ diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 68cad6d..81a51f7 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -56,6 +56,7 @@ #include "syscall/poll.h" #include "syscall/path.h" #include "syscall/proc.h" +#include "syscall/proc-pidfd.h" #include "syscall/signal.h" #include "syscall/sys.h" #include "syscall/sysvipc.h" @@ -95,6 +96,8 @@ void syscall_init(void) inotify_init(); netlink_init(); fuse_init(); + pidfd_init(); + fd_register_cleanup(FD_URANDOM, urandom_fd_cleanup); wakeup_pipe_init(); } diff --git a/src/syscall/time.c b/src/syscall/time.c index 8a76c4b..d29932b 100644 --- a/src/syscall/time.c +++ b/src/syscall/time.c @@ -15,6 +15,7 @@ #include "utils.h" +#include "core/vdso.h" #include "runtime/thread.h" /* current_thread, guest_tid */ #include "syscall/abi.h" #include "syscall/internal.h" @@ -253,6 +254,27 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva) if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0) return -LINUX_EFAULT; + /* If this trap came from the __kernel_clock_gettime vDSO svc_fallback, + * the trampoline parked the guest's CNTVCT_EL0 read in X9 before + * issuing SVC, and ELR_EL1 holds the address immediately after that + * SVC. Pair X9 with the wall_clock we just computed and seed the vvar + * so subsequent calls hit the fast path. Skip the seed for any other + * trap (raw syscall(SYS_clock_gettime, ...) from guest code, etc.): + * X9 is then arbitrary guest state, and seeding from it would poison + * the anchor and break every later fast-path call. 
+ */ + if (clockid == 1 /* CLOCK_MONOTONIC */ && current_thread) { + uint64_t elr = 0; + uint64_t guest_cntvct = 0; + if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1, + &elr) == HV_SUCCESS && + elr == vdso_clock_gettime_svc_pc() + 4 && + hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) == + HV_SUCCESS && + guest_cntvct != 0) + vdso_seed_anchor(g, guest_cntvct, ts.tv_sec, ts.tv_nsec); + } + return 0; } diff --git a/tests/manifest.txt b/tests/manifest.txt index ff9631b..e1f6c29 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -45,10 +45,12 @@ test-file-ops test-sysinfo test-io-opt test-syscall-smoke +test-vdso test-poll # diff=skip [section] I/O subsystem tests test-eventfd +test-eventfd-dup test-signalfd test-signalfd-hardening test-epoll @@ -83,8 +85,9 @@ test-clone3 # diff=skip test-fork-exec $TESTDIR/echo-test test-fork-lowbase -[section] COW fork isolation tests +[section] CoW fork isolation tests test-cow-fork +test-fork-synthetic-fd [section] O_CLOEXEC tests test-cloexec diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c index 8770420..f7cc0c7 100644 --- a/tests/test-cow-fork.c +++ b/tests/test-cow-fork.c @@ -1,4 +1,4 @@ -/* COW fork memory isolation tests +/* CoW fork memory isolation tests * * Copyright 2026 elfuse contributors * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. 
@@ -166,11 +166,11 @@ static void test_mmap_isolation(void) munmap(region, 4096); } -/* Test 4: Large region COW (verify no corruption) */ +/* Test 4: Large region CoW (verify no corruption) */ static void test_large_cow(void) { - TEST("fork: 1MiB COW integrity"); + TEST("fork: 1MiB CoW integrity"); int pipefd[2]; if (pipe(pipefd) != 0) { @@ -229,7 +229,7 @@ static void test_large_cow(void) int status; waitpid(pid, &status, 0); - EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed"); + EXPECT_TRUE(parent_ok && child_ok, "1MiB CoW integrity failed"); munmap(buf, sz); } @@ -302,7 +302,7 @@ static void test_brk_isolation(void) int main(void) { - printf("test-cow-fork: COW fork memory isolation tests\n"); + printf("test-cow-fork: CoW fork memory isolation tests\n"); test_stack_isolation(); test_heap_isolation(); diff --git a/tests/test-eventfd-dup.c b/tests/test-eventfd-dup.c new file mode 100644 index 0000000..484c2d7 --- /dev/null +++ b/tests/test-eventfd-dup.c @@ -0,0 +1,65 @@ +/* test-eventfd-dup.c -- dup of eventfd shares state (Linux contract) + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Linux dup of an eventfd produces a second descriptor that points at the + * same kernel object; reads and writes on either fd see the same counter. + * elfuse used to give each dup'd guest_fd a fresh side-table slot, so + * dup'd eventfds diverged, breaking programs that signal across the + * pair. 
This test pins the contract by: + * - duping an eventfd initialised with counter=7, reading via the dup, + * verifying the dup observes the source's initial value + * - writing via the source, reading via the dup, verifying state shares + * - closing one end of the alias and continuing to operate on the other + */ + +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } \ + } while (0) + +int main(void) +{ + int a = eventfd(7, EFD_CLOEXEC); + EXPECT(a >= 0, "eventfd(7) returned valid fd"); + int b = dup(a); + EXPECT(b >= 0, "dup(a) returned valid fd"); + + uint64_t v = 0; + EXPECT(read(b, &v, 8) == 8, "read 8 bytes from dup'd fd"); + EXPECT(v == 7, "dup'd fd observes source initial counter (7)"); + + uint64_t n = 42; + EXPECT(write(a, &n, 8) == 8, "write 42 to source fd"); + EXPECT(read(b, &v, 8) == 8, "read counter from dup'd fd"); + EXPECT(v == 42, "dup'd fd observes source write (42)"); + + close(a); + n = 99; + EXPECT(write(b, &n, 8) == 8, "write 99 to alias after closing source"); + EXPECT(read(b, &v, 8) == 8, "read after partial close"); + EXPECT(v == 99, "alias still functional after partial close"); + struct pollfd pfd = {.fd = b, .events = POLLIN}; + EXPECT(poll(&pfd, 1, 0) == 0, "alias is not readable after drain"); + close(b); + + if (failures) { + printf("test-eventfd-dup: %d FAIL\n", failures); + return 1; + } + puts("test-eventfd-dup: PASS"); + return 0; +} diff --git a/tests/test-fork-synthetic-fd.c b/tests/test-fork-synthetic-fd.c new file mode 100644 index 0000000..1e89a46 --- /dev/null +++ b/tests/test-fork-synthetic-fd.c @@ -0,0 +1,218 @@ +/* test-fork-synthetic-fd.c -- fork inheritance contract for synthetic fds + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The fork-IPC handoff does NOT serialize per-class side tables for + * 
eventfd/signalfd/timerfd/inotify/netlink/pidfd. Restoring the + * inherited host fd without that state leaves a half-functional slot, + * so fork-state.c explicitly drops these in the child. This test pins + * that contract: + * - urandom IS inherited (no per-class state to lose; cache is fresh + * in the child and arc4random_buf works) + * - eventfd / signalfd / timerfd / inotify are NOT inherited; the + * child sees EBADF and can recreate the fd at the same slot + * - the inherited host fd does not leak in the child + * + * Once a subsystem grows a serialize/restore path, the corresponding + * EBADF expectation here flips to a positive inheritance check. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } \ + } while (0) + +static int run_child(int (*fn)(int), int fd) +{ + pid_t pid = fork(); + if (pid < 0) + return -1; + if (pid == 0) + _exit(fn(fd)); + int status = 0; + if (waitpid(pid, &status, 0) < 0) + return -1; + return WIFEXITED(status) ? WEXITSTATUS(status) : -1; +} + +static int child_urandom_read(int fd) +{ + unsigned char b[8]; + if (read(fd, b, sizeof(b)) != (ssize_t) sizeof(b)) + return 1; + int seen_nonzero = 0; + for (size_t i = 0; i < sizeof(b); i++) + if (b[i] != 0) + seen_nonzero = 1; + return seen_nonzero ? 
0 : 2;
+}
+
+static int child_ebadf_read(int fd) /* 0 when read() fails with EBADF */
+{
+    char buf[8] = {0};
+    errno = 0;
+    ssize_t n = read(fd, buf, sizeof(buf));
+    if (n != -1)
+        return 1; /* read() did not fail */
+    if (errno != EBADF)
+        return 2; /* failed, but with an errno other than EBADF */
+    return 0;
+}
+
+static int child_ebadf_reusable_at_same_fd(int fd)
+{
+    int rc = child_ebadf_read(fd);
+    if (rc != 0)
+        return rc; /* propagate 1 or 2 from the EBADF probe */
+    int again = open("/dev/null", O_RDONLY | O_CLOEXEC);
+    if (again < 0)
+        return 3; /* could not open /dev/null */
+    if (again != fd) {
+        close(again);
+        return 4; /* open() returned a different descriptor number */
+    }
+    close(again);
+    return 0;
+}
+
+static int child_eventfd_recreate(int fd)
+{
+    /* The inherited eventfd slot should be FD_CLOSED in the child; we
+     * should be able to create a fresh eventfd that works normally.
+     */
+    char buf[8];
+    errno = 0;
+    if (read(fd, buf, sizeof(buf)) != -1 || errno != EBADF)
+        return 1; /* read() did not fail with EBADF */
+    close(fd); /* harmless on a closed slot */
+    int e = eventfd(0, EFD_CLOEXEC);
+    if (e < 0)
+        return 2; /* eventfd() failed */
+    uint64_t one = 1;
+    if (write(e, &one, sizeof(one)) != (ssize_t) sizeof(one)) {
+        close(e);
+        return 3; /* the counter write did not complete */
+    }
+    uint64_t got = 0;
+    if (read(e, &got, sizeof(got)) != (ssize_t) sizeof(got) || got != 1) {
+        close(e);
+        return 4; /* read-back did not return the value just written */
+    }
+    close(e);
+    return 0;
+}
+
+static void test_urandom_inherited(void)
+{
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    EXPECT(fd >= 0, "open /dev/urandom");
+    if (fd < 0)
+        return;
+    int rc = run_child(child_urandom_read, fd); /* presumably the child's status; 0 passes */
+    EXPECT(rc == 0, "child can read inherited /dev/urandom");
+    close(fd);
+}
+
+static void test_synthetic_dropped(const char *label, int (*opener)(void))
+{
+    int fd = opener();
+    EXPECT(fd >= 0, label); /* label doubles as the failure message */
+    if (fd < 0)
+        return;
+    int rc = run_child(child_ebadf_read, fd);
+    char msg[80];
+    snprintf(msg, sizeof(msg), "child sees EBADF on inherited %s", label);
+    EXPECT(rc == 0, msg);
+    close(fd);
+}
+
+static void test_eventfd_recreate(void)
+{
+    int fd = eventfd(0, EFD_CLOEXEC);
+    EXPECT(fd >= 0, "open eventfd");
+    if (fd < 0)
+        return;
+    int rc = run_child(child_eventfd_recreate, fd);
+    EXPECT(rc == 0, "child can recreate eventfd after drop");
+    close(fd);
+}
+
+static void test_low_synthetic_dropped(void)
+{
+    int saved_stdin = dup(STDIN_FILENO); /* kept so fd 0 can be restored below */
+    EXPECT(saved_stdin >= 0, "save stdin");
+    if (saved_stdin < 0)
+        return;
+
+    EXPECT(close(STDIN_FILENO) == 0, "close stdin"); /* free fd 0 for eventfd() */
+    int fd = eventfd(0, EFD_CLOEXEC);
+    EXPECT(fd == STDIN_FILENO, "eventfd reuses fd 0");
+    if (fd == STDIN_FILENO) {
+        int rc = run_child(child_ebadf_reusable_at_same_fd, fd);
+        EXPECT(rc == 0, "child sees EBADF on low inherited eventfd");
+        close(fd);
+    } else if (fd >= 0) { /* landed on another descriptor: just release it */
+        close(fd);
+    }
+
+    EXPECT(dup2(saved_stdin, STDIN_FILENO) == STDIN_FILENO, "restore stdin");
+    close(saved_stdin);
+}
+
+static int open_eventfd(void) /* opener callbacks for test_synthetic_dropped() */
+{
+    return eventfd(0, EFD_CLOEXEC);
+}
+static int open_timerfd(void)
+{
+    return timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+}
+static int open_signalfd(void)
+{
+    sigset_t s;
+    sigemptyset(&s);
+    sigaddset(&s, SIGUSR1);
+    return signalfd(-1, &s, SFD_CLOEXEC); /* -1 asks for a new descriptor */
+}
+static int open_inotify(void)
+{
+    return inotify_init1(IN_CLOEXEC);
+}
+
+int main(void)
+{
+    printf("test-fork-synthetic-fd: synthetic fd fork inheritance contract\n");
+    test_urandom_inherited();
+    test_synthetic_dropped("eventfd", open_eventfd);
+    test_synthetic_dropped("timerfd", open_timerfd);
+    test_synthetic_dropped("signalfd", open_signalfd);
+    test_synthetic_dropped("inotify", open_inotify);
+    test_eventfd_recreate();
+    test_low_synthetic_dropped();
+    if (failures) {
+        printf("test-fork-synthetic-fd: %d FAIL\n", failures);
+        return 1;
+    }
+    puts("test-fork-synthetic-fd: PASS");
+    return 0;
+}
diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c
index 28b76e7..891dd7b 100644
--- a/tests/test-large-io-boundary.c
+++ b/tests/test-large-io-boundary.c
@@ -182,12 +182,55 @@ static void test_large_read_from_split_block(void)
     EXPECT_TRUE(ok, "read returned short count or corrupted data");
 }

+static void test_urandom_read_crosses_boundary(void)
+{
+    TEST("/dev/urandom partial read at mapping boundary");
+
+    size_t page = (size_t) sysconf(_SC_PAGESIZE);
+    unsigned char *map = mmap(NULL, page * 2, PROT_READ | PROT_WRITE,
+                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (map == MAP_FAILED) {
+        FAIL("mmap failed");
+        return;
+    }
+    if (munmap(map + page, page) != 0) { /* drop the second page so the buffer ends at a hole */
+        munmap(map, page);
+        FAIL("munmap guard failed");
+        return;
+    }
+
+    memset(map, 0, page); /* zero baseline for the any_nonzero scan below */
+
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        munmap(map, page);
+        FAIL("open failed");
+        return;
+    }
+
+    ssize_t ret = read(fd, map, page * 2); /* asks for two pages; only the first is mapped */
+    close(fd);
+
+    bool any_nonzero = false;
+    for (size_t i = 0; i < page; i++) {
+        if (map[i] != 0) {
+            any_nonzero = true;
+            break;
+        }
+    }
+
+    munmap(map, page);
+    EXPECT_TRUE(ret == (ssize_t) page && any_nonzero, /* exactly one page, and it was written */
+                "urandom read did not preserve partial boundary result");
+}
+
 int main(void)
 {
     printf("large I/O boundary tests\n\n");

     test_large_write();
     test_large_read_from_split_block();
+    test_urandom_read_crosses_boundary();

     SUMMARY("test-large-io-boundary");
     return fails ? 1 : 0;
diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh
index e6a6140..ad6921b 100755
--- a/tests/test-matrix.sh
+++ b/tests/test-matrix.sh
@@ -494,7 +494,7 @@ run_unit_tests()
     printf "\nNegative tests\n"
     test_check "$runner" "test-negative" "0 failed" "$bindir/test-negative"

-    printf "\nCOW fork isolation\n"
+    printf "\nCoW fork isolation\n"
     test_check "$runner" "test-cow-fork" "PASS" "$bindir/test-cow-fork"

     printf "\nGuard page / mmap edge cases\n"
diff --git a/tests/test-syscall-smoke.c b/tests/test-syscall-smoke.c
index 809998f..8419467 100644
--- a/tests/test-syscall-smoke.c
+++ b/tests/test-syscall-smoke.c
@@ -6,6 +6,7 @@
 #include
 #include
+#include /* NOTE(review): header names are missing from this copy of the patch; SSIZE_MAX below suggests <limits.h> here -- confirm */
 #include
 #include
 #include
@@ -57,6 +58,10 @@
 #define SYS_sigaltstack 132
 #endif

+#ifndef O_PATH
+#define O_PATH 010000000 /* octal fallback for headers that do not define it */
+#endif
+
 #ifndef SYS_set_tid_address
 #define SYS_set_tid_address 96
 #endif
@@ -623,6 +628,242 @@ static void test_sysv_semaphore_ops(void)
     }
 }

+static void test_urandom_byte_reads(void)
+{
+    TEST("/dev/urandom byte reads");
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        FAIL("open");
+        return;
+    }
+
+    unsigned char bytes[32];
+    for (size_t i = 0; i < sizeof(bytes); i++) {
+        ssize_t n = read(fd, &bytes[i], 1); /* one byte per read() call */
+        if (n != 1) {
+            close(fd);
+            FAIL("read");
+            return;
+        }
+    }
+    close(fd);
+
+    bool all_same = true;
+    for (size_t i = 1; i < sizeof(bytes); i++) {
+        if (bytes[i] != bytes[0]) {
+            all_same = false;
+            break;
+        }
+    }
+    if (all_same) { /* 32 identical bytes are treated as a failure */
+        FAIL("entropy stream did not vary");
+        return;
+    }
+    PASS();
+}
+
+static void test_urandom_open_flags(void)
+{
+    TEST("/dev/urandom open flags");
+
+    errno = 0;
+    int dirfd = open("/dev/urandom", O_RDONLY | O_DIRECTORY); /* must fail with ENOTDIR */
+    if (dirfd >= 0) {
+        close(dirfd);
+        FAIL("O_DIRECTORY open succeeded");
+        return;
+    }
+    if (errno != ENOTDIR) {
+        FAIL("O_DIRECTORY errno");
+        return;
+    }
+
+    int pathfd = open("/dev/urandom", O_PATH | O_CLOEXEC); /* opens, but reads must fail with EBADF */
+    if (pathfd < 0) {
+        FAIL("O_PATH open");
+        return;
+    }
+    unsigned char b = 0;
+    errno = 0;
+    ssize_t n = read(pathfd, &b, 1);
+    int saved_errno = errno; /* captured before close() can overwrite errno */
+    close(pathfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_PATH read");
+        return;
+    }
+
+    int wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); /* write-only: F_GETFL must say so, read() must fail */
+    if (wfd < 0) {
+        FAIL("O_WRONLY open");
+        return;
+    }
+    int fl = fcntl(wfd, F_GETFL);
+    errno = 0;
+    n = read(wfd, &b, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (fl < 0 || (fl & O_ACCMODE) != O_WRONLY) {
+        FAIL("O_WRONLY F_GETFL");
+        return;
+    }
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY read");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open readv");
+        return;
+    }
+    struct iovec wv[2] = {{&b, 1}, {&b, 1}};
+    errno = 0;
+    n = readv(wfd, wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY readv");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized readv");
+        return;
+    }
+    struct iovec huge_wv[2] = {{&b, SSIZE_MAX}, {&b, 1}}; /* lengths sum past SSIZE_MAX; EBADF still expected on a write-only fd */
+    errno = 0;
+    n = readv(wfd, huge_wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized readv");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized single readv");
+        return;
+    }
+    struct iovec huge_one_wv = {&b, (size_t) SSIZE_MAX + 1}; /* a single length above SSIZE_MAX */
+    errno = 0;
+    n = readv(wfd, &huge_one_wv, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized single readv");
+        return;
+    }
+
+    int rfd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); /* stays open for the readv and fork checks below */
+    if (rfd < 0) {
+        FAIL("O_RDONLY open readv");
+        return;
+    }
+    unsigned char rb[2] = {0};
+    struct iovec rv[2] = {{&rb[0], 1}, {&rb[1], 1}};
+    n = readv(rfd, rv, 2);
+    if (n != 2) {
+        close(rfd);
+        FAIL("O_RDONLY readv");
+        return;
+    }
+
+    struct iovec huge[2] = {{&b, SSIZE_MAX}, {&b, 1}}; /* same oversized vectors on a readable fd: EINVAL expected */
+    errno = 0;
+    n = readv(rfd, huge, 2);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized readv");
+        return;
+    }
+
+    struct iovec huge_one = {&b, (size_t) SSIZE_MAX + 1};
+    errno = 0;
+    n = readv(rfd, &huge_one, 1);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized single readv");
+        return;
+    }
+
+    pid_t pid = fork(); /* first fork: the child only has to read one byte from rfd */
+    if (pid < 0) {
+        close(rfd);
+        FAIL("fork inherited urandom");
+        return;
+    }
+    if (pid == 0) {
+        unsigned char child_b = 0;
+        _exit(read(rfd, &child_b, 1) == 1 ? 0 : 1);
+    }
+    int status = 0;
+    waitpid(pid, &status, 0); /* NOTE(review): return value unchecked; status stays 0 if waitpid() fails */
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        close(rfd);
+        FAIL("inherited urandom read");
+        return;
+    }
+
+    int p[2]; /* pipe carrying the child's 64 bytes back to the parent */
+    if (pipe(p) != 0) {
+        close(rfd);
+        FAIL("urandom fork pipe");
+        return;
+    }
+    unsigned char seed = 0;
+    if (read(rfd, &seed, 1) != 1) { /* one parent read before the second fork */
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("prime urandom cache before fork");
+        return;
+    }
+    pid = fork();
+    if (pid < 0) {
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("fork urandom cache isolation");
+        return;
+    }
+    if (pid == 0) {
+        close(p[0]);
+        unsigned char child_buf[64];
+        ssize_t got = read(rfd, child_buf, sizeof(child_buf));
+        ssize_t put = got == (ssize_t) sizeof(child_buf)
+                          ? write(p[1], child_buf, sizeof(child_buf))
+                          : -1;
+        close(p[1]);
+        _exit(put == (ssize_t) sizeof(child_buf) ? 0 : 1);
+    }
+    close(p[1]);
+    unsigned char parent_buf[64];
+    unsigned char child_buf[64];
+    ssize_t parent_n = read(rfd, parent_buf, sizeof(parent_buf));
+    ssize_t child_n = read(p[0], child_buf, sizeof(child_buf));
+    close(p[0]);
+    status = 0;
+    waitpid(pid, &status, 0);
+    close(rfd);
+    if (parent_n != (ssize_t) sizeof(parent_buf) ||
+        child_n != (ssize_t) sizeof(child_buf) || !WIFEXITED(status) ||
+        WEXITSTATUS(status) != 0) {
+        FAIL("urandom fork cache isolation read");
+        return;
+    }
+    if (memcmp(parent_buf, child_buf, sizeof(parent_buf)) == 0) { /* parent and child received the same 64 bytes */
+        FAIL("urandom fork duplicated cached bytes");
+        return;
+    }
+
+    PASS();
+}
+
 int main(int argc, char **argv)
 {
     printf("test-syscall-smoke: direct syscall smoke coverage\n\n");
@@ -642,6 +883,8 @@ int main(int argc, char **argv)
     test_memory_stubs();
     test_accept4();
     test_sysv_semaphore_ops();
+    test_urandom_byte_reads();
+    test_urandom_open_flags();

     SUMMARY("test-syscall-smoke");
     return fails > 0 ?
1 : 0; diff --git a/tests/test-vdso.c b/tests/test-vdso.c new file mode 100644 index 0000000..83aab76 --- /dev/null +++ b/tests/test-vdso.c @@ -0,0 +1,242 @@ +/* test-vdso.c -- vDSO ELF correctness and symbol-resolution probe + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Confirms the synthetic vDSO emitted by src/core/vdso.c: + * 1. is published via AT_SYSINFO_EHDR + * 2. parses as a valid ELF shared object + * 3. exports the four __kernel_* symbols at addresses inside the page + * 4. carries GNU symbol versioning naming LINUX_2.6.39 so glibc/musl + * dl_vdso_vsym() can resolve unversioned lookups + * 5. trampolines actually execute (call __kernel_clock_gettime and + * compare the result against a direct SVC clock_gettime) + * + * Static binary so the standard test driver runs it under elfuse with + * no sysroot. The probe walks the vDSO's dynamic linker structure + * itself rather than relying on dlsym (which is unavailable in static + * builds anyway), so a regression in the elf layout fails this test + * regardless of which libc would later consume it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define FAIL(msg) \ + do { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } while (0) + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) \ + FAIL(msg); \ + } while (0) + +/* SysV ELF hash, matches the implementation in src/core/vdso.c. 
*/ +static uint32_t elf_hash(const char *name) +{ + uint32_t h = 0, g; + while (*name) { + h = (h << 4) + (unsigned char) *name++; + g = h & 0xf0000000U; + if (g) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +static const Elf64_Sym *lookup_sym(const Elf64_Ehdr *ehdr, + const Elf64_Sym *symtab, + const char *strtab, + const uint32_t *hash, + const char *name) +{ + uint32_t nbucket = hash[0]; + uint32_t nchain = hash[1]; + const uint32_t *bucket = &hash[2]; + const uint32_t *chain = &bucket[nbucket]; + uint32_t h = elf_hash(name) % nbucket; + for (uint32_t i = bucket[h]; i && i < nchain; i = chain[i]) { + if (strcmp(&strtab[symtab[i].st_name], name) == 0) + return &symtab[i]; + } + (void) ehdr; + return NULL; +} + +typedef struct { + const Elf64_Sym *symtab; + const char *strtab; + const uint32_t *hash; + const uint16_t *versym; + const Elf64_Verdef *verdef; + size_t strsz; + int verdef_count; +} vdso_t; + +static int parse_vdso(const Elf64_Ehdr *ehdr, vdso_t *v) +{ + memset(v, 0, sizeof(*v)); + const Elf64_Phdr *phdr = + (const Elf64_Phdr *) ((const uint8_t *) ehdr + ehdr->e_phoff); + const Elf64_Dyn *dyn = NULL; + for (int i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type == PT_DYNAMIC) { + dyn = + (const Elf64_Dyn *) ((const uint8_t *) ehdr + phdr[i].p_offset); + break; + } + } + if (!dyn) + return -1; + for (; dyn->d_tag != DT_NULL; dyn++) { + const uint8_t *p = (const uint8_t *) ehdr + dyn->d_un.d_ptr; + switch (dyn->d_tag) { + case DT_SYMTAB: + v->symtab = (const Elf64_Sym *) p; + break; + case DT_STRTAB: + v->strtab = (const char *) p; + break; + case DT_STRSZ: + v->strsz = (size_t) dyn->d_un.d_val; + break; + case DT_HASH: + v->hash = (const uint32_t *) p; + break; + case DT_VERSYM: + v->versym = (const uint16_t *) p; + break; + case DT_VERDEF: + v->verdef = (const Elf64_Verdef *) p; + break; + case DT_VERDEFNUM: + v->verdef_count = (int) dyn->d_un.d_val; + break; + default: + break; + } + } + return (v->symtab && v->strtab && v->hash) ? 
0 : -1; +} + +static const char *verdef_name_for_ndx(const vdso_t *v, uint16_t ndx) +{ + const Elf64_Verdef *vd = v->verdef; + for (int i = 0; i < v->verdef_count && vd; i++) { + if (vd->vd_ndx == ndx) { + const Elf64_Verdaux *aux = + (const Elf64_Verdaux *) ((const uint8_t *) vd + vd->vd_aux); + return &v->strtab[aux->vda_name]; + } + if (!vd->vd_next) + break; + vd = (const Elf64_Verdef *) ((const uint8_t *) vd + vd->vd_next); + } + return NULL; +} + +typedef int (*clock_gettime_fn)(clockid_t, struct timespec *); + +static void test_vdso(void) +{ + unsigned long base = getauxval(AT_SYSINFO_EHDR); + EXPECT(base != 0, "AT_SYSINFO_EHDR is zero"); + if (!base) + return; + printf("AT_SYSINFO_EHDR = 0x%lx\n", base); + + const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *) base; + EXPECT(memcmp(ehdr->e_ident, + "\x7f" + "ELF", + 4) == 0, + "vDSO ELF magic"); + EXPECT(ehdr->e_machine == EM_AARCH64, "vDSO e_machine"); + EXPECT(ehdr->e_type == ET_DYN, "vDSO e_type"); + + vdso_t v; + EXPECT(parse_vdso(ehdr, &v) == 0, "vDSO dynamic section parse"); + if (!v.symtab || !v.strtab || !v.hash) + return; + + /* All four __kernel_* symbols must resolve and land in the vDSO page. */ + static const char *names[] = { + "__kernel_rt_sigreturn", "__kernel_clock_getres", + "__kernel_clock_gettime", "__kernel_gettimeofday"}; + const Elf64_Sym *syms[4] = {0}; + for (int i = 0; i < 4; i++) { + syms[i] = lookup_sym(ehdr, v.symtab, v.strtab, v.hash, names[i]); + char buf[64]; + snprintf(buf, sizeof(buf), "lookup %s", names[i]); + EXPECT(syms[i] != NULL, buf); + if (!syms[i]) + continue; + uint64_t addr = base + syms[i]->st_value; + snprintf(buf, sizeof(buf), "%s address in vDSO page", names[i]); + EXPECT(addr >= base && addr < base + 0x1000, buf); + } + + /* Symbol versioning: every defined symbol must point at LINUX_2.6.39. 
*/ + EXPECT(v.versym != NULL, "vDSO DT_VERSYM present"); + EXPECT(v.verdef != NULL, "vDSO DT_VERDEF present"); + if (v.versym && v.verdef) { + for (int i = 0; i < 4; i++) { + if (!syms[i]) + continue; + uint32_t sym_idx = (uint32_t) (syms[i] - v.symtab); + uint16_t ndx = v.versym[sym_idx]; + const char *ver = verdef_name_for_ndx(&v, ndx); + char buf[80]; + snprintf(buf, sizeof(buf), "%s versioned LINUX_2.6.39", names[i]); + EXPECT(ver && strcmp(ver, "LINUX_2.6.39") == 0, buf); + } + } + + /* Direct call into the vDSO trampoline. Must agree with SVC. */ + const Elf64_Sym *cg = + lookup_sym(ehdr, v.symtab, v.strtab, v.hash, "__kernel_clock_gettime"); + if (cg) { + clock_gettime_fn fn = + (clock_gettime_fn) (uintptr_t) (base + cg->st_value); + struct timespec via_vdso = {0}, via_svc = {0}; + int r1 = fn(CLOCK_MONOTONIC, &via_vdso); + int r2 = (int) syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &via_svc); + EXPECT(r1 == 0, "vDSO clock_gettime returned 0"); + EXPECT(r2 == 0, "SVC clock_gettime returned 0"); + /* Both should produce a sane monotonic value within ~10ms of each + * other (allowing for the gap between the two calls). + */ + int64_t delta_ns = + ((int64_t) via_svc.tv_sec - via_vdso.tv_sec) * 1000000000LL + + (via_svc.tv_nsec - via_vdso.tv_nsec); + if (delta_ns < 0) + delta_ns = -delta_ns; + EXPECT(delta_ns < 10000000, "vDSO and SVC clock_gettime agree"); + printf("vDSO/SVC clock_gettime delta = %" PRId64 " ns\n", delta_ns); + } +} + +int main(void) +{ + printf("test-vdso: vDSO ELF + symbol-versioning probe\n"); + test_vdso(); + if (failures) { + printf("test-vdso: %d FAIL\n", failures); + return 1; + } + puts("test-vdso: PASS"); + return 0; +}