From a24fc536a910cdab33b6eff11d509c9300eb5a7d Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Wed, 27 May 2026 13:50:28 +0800
Subject: [PATCH] Speed up vDSO clock_gettime via CNTVCT, amortize urandom

vDSO clock_gettime drops from a 1256 ns SVC trap to 2.5 ns via a
CNTVCT-based fast path (493x speedup, 20x under the sub-50 ns design
target). The trampoline is a 28-instruction A64 sequence that reads
CNTVCT_EL0, LDAR-acquires the vvar initialized flag, and interpolates
wall clock from the anchor as delta * 125 / 3 (Apple Silicon CNTFRQ =
24 MHz), falling back to SVC on first call or CNTVCT regression. The
first SVC
seeds the vvar via a three-state CAS (0 -> 2 -> 1) so concurrent first
calls cannot tear the anchor fields. The seed is gated on ELR_EL1
matching the trampoline's svc_fallback PC so an unrelated raw
clock_gettime syscall cannot poison the anchor from arbitrary X9.

/dev/urandom 1-byte reads drop from 5688 ns uncached to 2054 ns (2.77x)
via a new per-fd entropy cache: an arc4random_buf-refilled 4 KiB buffer
per FD_URANDOM slot. The cache is zeroed on close via a type-to-cleanup
registry that also closes pre-existing dup and fork-state race windows
for every synthetic fd type.

eventfd dup shares state across aliases per the Linux contract
(refcounted slot plus eventfd_owner[FD_TABLE_SIZE] table). The dup path
holds fd_lock and sfd_lock together for the bind commit so racing close
cannot leak the refcount; the source identity is pinned via snapshotted
host fd so a racing close-and-rebind of the source cannot bind to the
wrong slot. tests/test-eventfd-dup pins the shared-state contract.

fork_ipc_send_fd_table filters eventfd, signalfd, timerfd, inotify,
netlink, pidfd, and epoll out of the SCM_RIGHTS payload. macOS rejects
kqueue fds across SCM_RIGHTS and per-class side-table state is not
transferable, so a clean drop is the only honest contract.
tests/test-fork-synthetic-fd pins it.

Startup decomposition: ELFUSE_STARTUP_TRACE=1 emits per-step wall time
for VM bring-up (17 steps on test-hello, dominated by hv_vcpu_create and
guest_init at roughly 0.9 ms each). When unset, each step costs only a
pthread_once check and two branches.
---
 src/core/bootstrap.c           |  39 +++
 src/core/guest.c               |  19 ++
 src/core/rosetta.c             |   2 +-
 src/core/startup-trace.h       |  66 +++++
 src/core/vdso.c                | 446 ++++++++++++++++++++++++++++++---
 src/core/vdso.h                |  30 ++-
 src/runtime/fork-state.c       |  48 +++-
 src/runtime/forkipc.c          |   2 +-
 src/syscall/abi.h              |   1 +
 src/syscall/fd.c               | 176 ++++++++++++-
 src/syscall/fd.h               |  15 ++
 src/syscall/fdtable.c          |  72 +++++-
 src/syscall/fs.c               | 107 +++++---
 src/syscall/fuse.c             |  16 +-
 src/syscall/inotify.c          |   1 +
 src/syscall/internal.h         |  62 ++++-
 src/syscall/io.c               | 247 ++++++++++++++----
 src/syscall/io.h               |   2 +
 src/syscall/net-msg.c          |   4 +-
 src/syscall/net.c              |   4 +-
 src/syscall/netlink.c          |   1 +
 src/syscall/proc-pidfd.c       |   7 +
 src/syscall/proc-pidfd.h       |   1 +
 src/syscall/signal.c           |   2 +-
 src/syscall/syscall.c          |   3 +
 src/syscall/time.c             |  22 ++
 tests/manifest.txt             |   5 +-
 tests/test-cow-fork.c          |  10 +-
 tests/test-eventfd-dup.c       |  65 +++++
 tests/test-fork-synthetic-fd.c | 218 ++++++++++++++++
 tests/test-large-io-boundary.c |  43 ++++
 tests/test-matrix.sh           |   2 +-
 tests/test-syscall-smoke.c     | 243 ++++++++++++++++++
 tests/test-vdso.c              | 242 ++++++++++++++++++
 34 files changed, 2056 insertions(+), 167 deletions(-)
 create mode 100644 src/core/startup-trace.h
 create mode 100644 tests/test-eventfd-dup.c
 create mode 100644 tests/test-fork-synthetic-fd.c
 create mode 100644 tests/test-vdso.c

diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c
index c6522df..595c0d3 100644
--- a/src/core/bootstrap.c
+++ b/src/core/bootstrap.c
@@ -21,6 +21,7 @@
 #include "core/bootstrap.h"
 #include "core/rosetta.h"
 #include "core/stack.h"
+#include "core/startup-trace.h"
 #include "core/vdso.h"
 
 #include "runtime/thread.h"
@@ -334,14 +335,17 @@ int guest_bootstrap_prepare(guest_t *g,
     mem_region_t regions[MAX_BOOT_REGIONS];
     int nregions = 0;
     uint64_t native_vdso;
+    uint64_t t0;
 
     memset(boot, 0, sizeof(*boot));
     *guest_initialized = false;
 
+    t0 = startup_trace_now_ns();
     if (elf_load(elf_host_path, &boot->elf_info) < 0) {
         log_error("failed to load ELF: %s", elf_host_path);
         return -1;
     }
+    startup_trace_step("elf_load", t0);
 
     bool want_rosetta = false;
     if (boot->elf_info.e_machine == EM_X86_64) {
@@ -374,10 +378,12 @@ int guest_bootstrap_prepare(guest_t *g,
      * the request is non-fatal in either direction.
      */
     uint32_t req_ipa = want_rosetta ? 48 : 0;
+    t0 = startup_trace_now_ns();
     if (guest_init(g, 0, req_ipa) < 0) {
         log_error("failed to initialize guest");
         return -1;
     }
+    startup_trace_step("guest_init", t0);
     *guest_initialized = true;
     g->is_rosetta = want_rosetta;
     proc_set_rosetta_active(want_rosetta);
@@ -405,11 +411,13 @@ int guest_bootstrap_prepare(guest_t *g,
     } else {
         boot->elf_load_base =
             (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0;
+        t0 = startup_trace_now_ns();
         if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base,
                              g->guest_size, boot->elf_load_base) < 0) {
             log_error("failed to map ELF segments");
             return -1;
         }
+        startup_trace_step("elf_map_segments", t0);
 
         /* Track the lowest loaded ELF address so the legacy fork IPC path
          * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full.
@@ -427,8 +435,10 @@ int guest_bootstrap_prepare(guest_t *g,
             g->stack_top = STACK_TOP_DEFAULT;
         g->stack_base = g->stack_top - STACK_SIZE;
 
+        t0 = startup_trace_now_ns();
         if (!load_interpreter(g, sysroot, boot))
             return -1;
+        startup_trace_step("load_interpreter", t0);
     }
 
     if (shim_bin_len > BLOCK_2MIB) {
@@ -436,6 +446,7 @@ int guest_bootstrap_prepare(guest_t *g,
         return -1;
     }
 
+    t0 = startup_trace_now_ns();
     memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len);
     log_debug("shim loaded at offset 0x%llx (%zu bytes)",
               (unsigned long long) g->shim_base, shim_bin_len);
@@ -448,12 +459,15 @@ int guest_bootstrap_prepare(guest_t *g,
     }
     sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base,
                           shim_bin_len);
+    startup_trace_step("shim_load_icache", t0);
 
+    t0 = startup_trace_now_ns();
     if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) {
         log_error("too many memory regions (%d >= %d)", nregions,
                   MAX_BOOT_REGIONS);
         return -1;
     }
+    startup_trace_step("build_boot_regions", t0);
 
     /* Rosetta path: append the rosetta image as a non-identity region so the
      * page-table builder maps VA 0x800000000000 -> primary buffer GPA.
@@ -461,24 +475,29 @@ int guest_bootstrap_prepare(guest_t *g,
      * from the same pool that guest_build_page_tables is about to consume).
      */
     if (want_rosetta) {
+        t0 = startup_trace_now_ns();
         if (rosetta_prepare(g, elf_host_path, regions, &nregions,
                             MAX_BOOT_REGIONS, verbose, &rr) < 0) {
             log_error("rosetta_prepare failed for %s", elf_guest_path);
             return -1;
         }
+        startup_trace_step("rosetta_prepare", t0);
     }
 
+    t0 = startup_trace_now_ns();
     boot->ttbr0 = guest_build_page_tables(g, regions, nregions);
     if (!boot->ttbr0) {
         log_error("failed to build page tables");
         return -1;
     }
+    startup_trace_step("guest_build_page_tables", t0);
     /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
      * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the
      * wrong place to stage a bring-up flush -- bootstrap may run on a thread
      * whose slot is later consumed by an unrelated syscall.
      */
 
+    t0 = startup_trace_now_ns();
     if (want_rosetta) {
         /* /proc/self/maps for a rosetta guest reports the rosetta translator
          * as a single anonymous region covering [VA, VA+size). The original
@@ -505,12 +524,14 @@ int guest_bootstrap_prepare(guest_t *g,
     }
 
     register_runtime_regions(g, shim_bin_len);
+    startup_trace_step("register_regions", t0);
 
     log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0,
               (unsigned long long) g->ipa_base);
     if (verbose)
         log_initial_page_tables(g, boot->ttbr0);
 
+    t0 = startup_trace_now_ns();
     syscall_init();
     proc_init();
 
@@ -526,6 +547,7 @@ int guest_bootstrap_prepare(guest_t *g,
     proc_set_elf_path(elf_guest_path);
     if (sysroot)
         proc_set_sysroot(sysroot);
+    startup_trace_step("runtime_init", t0);
 
     /* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the
      * binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes
@@ -536,18 +558,22 @@ int guest_bootstrap_prepare(guest_t *g,
     int rosetta_argc = 0;
     const char **rosetta_argv = NULL;
     if (want_rosetta) {
+        t0 = startup_trace_now_ns();
         if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp,
                              elf_guest_path, guest_argc, guest_argv, &rr,
                              verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) {
             log_error("rosetta_finalize failed");
             return -1;
         }
+        startup_trace_step("rosetta_finalize", t0);
     } else {
         proc_set_cmdline(guest_argc, guest_argv);
     }
     proc_set_environ((const char **) environ);
 
+    t0 = startup_trace_now_ns();
     native_vdso = vdso_build(g);
+    startup_trace_step("vdso_build", t0);
     linux_stack_auxv_t auxv;
     const elf_info_t *stack_elf =
         want_rosetta ? &rr.rosetta_info : &boot->elf_info;
@@ -555,6 +581,7 @@ int guest_bootstrap_prepare(guest_t *g,
     uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base;
     int stack_argc = want_rosetta ? rosetta_argc : guest_argc;
     const char **stack_argv = want_rosetta ? rosetta_argv : guest_argv;
+    t0 = startup_trace_now_ns();
     boot->stack_pointer = build_linux_stack(
         g, g->stack_top, stack_argc, stack_argv, (const char **) environ,
         stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1,
@@ -564,6 +591,7 @@ int guest_bootstrap_prepare(guest_t *g,
         free(rosetta_argv);
         return -1;
     }
+    startup_trace_step("build_linux_stack", t0);
     /* rosetta_argv was copied into the guest stack; the host allocation is
      * no longer needed. The strings themselves are constants (ROSETTA_PATH)
      * or owned by the caller (binary_path, guest_argv entries) so freeing
@@ -599,6 +627,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
 {
     uint64_t sctlr;
     uint64_t sctlr_with_mmu;
+    uint64_t t0;
     /* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at
      * KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps
      * working. Aarch64 guests stay on the EPD1=1 variant which keeps the
@@ -613,7 +642,9 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     hv_vcpu_t vcpu;
     hv_vcpu_exit_t *vexit;
 
+    t0 = startup_trace_now_ns();
     HV_CHECK(hv_vcpu_create(&vcpu, &vexit, NULL));
+    startup_trace_step("hv_vcpu_create", t0);
     g->vcpu = vcpu;
     g->exit = vexit;
     *out_vcpu = vcpu;
@@ -621,6 +652,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
 
     thread_register_main(vcpu, vexit, proc_get_pid(), el1_sp);
 
+    t0 = startup_trace_now_ns();
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, shim_ipa + 0x800));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value));
@@ -632,6 +664,12 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, el1_sp));
 
+    /* CNTKCTL_EL1.EL0VCTEN | EL0PCTEN: allow EL0 to read CNTVCT_EL0 /
+     * CNTPCT_EL0. Required by the vDSO clock_gettime fast path (and is the
+     * default on native Linux), without which the guest gets 0 back from MRS.
+     */
+    HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CNTKCTL_EL1, 0x3ULL));
+
     HV_CHECK(hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, &sctlr));
     log_debug("SCTLR_EL1 default=0x%llx", (unsigned long long) sctlr);
 
@@ -645,6 +683,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE |
                      SCTLR_UCT | SCTLR_UCI;
     HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_X0, sctlr_with_mmu));
+    startup_trace_step("hv_vcpu_configure", t0);
 
     log_debug(
         "vCPU configured: PC=0x%llx SCTLR=0x%llx VBAR=0x%llx TTBR0=0x%llx "
diff --git a/src/core/guest.c b/src/core/guest.c
index 6393b00..6098828 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -38,6 +38,7 @@
 #include <unistd.h>
 
 #include "core/guest.h"
+#include "core/startup-trace.h"
 #include "debug/log.h"
 #include "utils.h"
 #include "runtime/thread.h" /* thread_destroy_all_vcpus */
@@ -202,6 +203,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
 
 int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
 {
+    uint64_t t0;
+
     memset(g, 0, sizeof(*g));
     g->shm_fd = -1;
     g->ipa_base = GUEST_IPA_BASE;
@@ -257,6 +260,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
      * seconds max wait) to handle this gracefully.
      */
     hv_return_t ret = HV_ERROR;
+    t0 = startup_trace_now_ns();
     for (int attempt = 0; attempt < 30; attempt++) {
         hv_vm_config_t config = hv_vm_config_create();
         hv_vm_config_set_ipa_size(config, vm_ipa);
@@ -266,6 +270,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
             break;
         usleep(500000); /* 500ms between attempts */
     }
+    startup_trace_step("hv_vm_create", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret,
                   vm_ipa);
@@ -307,8 +312,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
          * physical memory. Do NOT memset because that would touch every
          * page and defeat demand paging.
          */
+        t0 = startup_trace_now_ns();
         g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE,
                             MAP_ANON | MAP_PRIVATE, -1, 0);
+        startup_trace_step("primary_mmap", t0);
         if (g->host_base == MAP_FAILED) {
             perror("guest: mmap");
             g->host_base = NULL;
@@ -320,6 +327,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
          * path instead of SCM_RIGHTS fd passing.
          */
         char tmppath[] = "/tmp/elfuse-XXXXXX";
+        t0 = startup_trace_now_ns();
         int sfd = mkstemp(tmppath);
         if (sfd >= 0) {
             unlink(tmppath); /* Unlink immediately; fd keeps file alive */
@@ -335,9 +343,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
                 close(sfd);
             }
         }
+        startup_trace_step("cow_shm_upgrade", t0);
 
+        t0 = startup_trace_now_ns();
         ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size,
                         HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
+        startup_trace_step("hv_vm_map", t0);
         if (ret == HV_SUCCESS) {
             mapped_size = try_size;
             mapped = true;
@@ -380,6 +391,8 @@ int guest_init_from_shm(guest_t *g,
                         uint64_t size,
                         uint32_t ipa_bits)
 {
+    uint64_t t0;
+
     memset(g, 0, sizeof(*g));
     g->shm_fd = -1; /* Child does not own the shm */
     g->ipa_base = GUEST_IPA_BASE;
@@ -403,8 +416,10 @@ int guest_init_from_shm(guest_t *g,
      * the parent's frozen snapshot; writes are private to this process.
      * macOS CoW is page-granular: only modified pages are duplicated.
      */
+    t0 = startup_trace_now_ns();
     g->host_base =
         mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
+    startup_trace_step("shm_mmap", t0);
     if (g->host_base == MAP_FAILED) {
         perror("guest: mmap shm");
         g->host_base = NULL;
@@ -417,6 +432,7 @@ int guest_init_from_shm(guest_t *g,
 
     /* Create HVF VM with the same IPA width as the parent */
     hv_return_t ret = HV_ERROR;
+    t0 = startup_trace_now_ns();
     for (int attempt = 0; attempt < 30; attempt++) {
         hv_vm_config_t config = hv_vm_config_create();
         hv_vm_config_set_ipa_size(config, ipa_bits);
@@ -426,6 +442,7 @@ int guest_init_from_shm(guest_t *g,
             break;
         usleep(500000);
     }
+    startup_trace_step("hv_vm_create_shm", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
         munmap(g->host_base, size);
@@ -433,8 +450,10 @@ int guest_init_from_shm(guest_t *g,
         return -1;
     }
 
+    t0 = startup_trace_now_ns();
     ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size,
                     HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
+    startup_trace_step("hv_vm_map_shm", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_map (shm) failed: %d", (int) ret);
         hv_vm_destroy();
diff --git a/src/core/rosetta.c b/src/core/rosetta.c
index 32588b4..caeabae 100644
--- a/src/core/rosetta.c
+++ b/src/core/rosetta.c
@@ -469,7 +469,7 @@ int rosetta_finalize(guest_t *g,
      * goto fail must be introduced below, or the fail handler would
      * double-close it.
      */
-    int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd);
+    int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd, NULL);
     if (bin_guest_fd < 0) {
         log_error("rosetta_finalize: fd_alloc_at(3) failed");
         goto fail;
diff --git a/src/core/startup-trace.h b/src/core/startup-trace.h
new file mode 100644
index 0000000..b2b75d8
--- /dev/null
+++ b/src/core/startup-trace.h
@@ -0,0 +1,66 @@
+/* Startup tracing helpers
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Lightweight per-step wall-time tracer for VM bring-up. Gated by the
+ * ELFUSE_STARTUP_TRACE environment variable; when tracing is disabled, each
+ * step costs one pthread_once check plus two branches. The helpers are
+ * static inline so each translation unit can use them without pulling in a
+ * separate object; getenv therefore runs once per translation unit, which
+ * is harmless because every copy resolves to the same value.
+ */
+
+#ifndef ELFUSE_STARTUP_TRACE_H
+#define ELFUSE_STARTUP_TRACE_H
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/* File-scope cache (one copy per translation unit including this header).
+ * pthread_once serializes concurrent first callers and supplies the
+ * memory ordering that makes the cached value safely visible to all
+ * subsequent readers without explicit atomics.
+ */
+static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT;
+static bool startup_trace_value;
+
+static inline void startup_trace_resolve(void)
+{
+    const char *v = getenv("ELFUSE_STARTUP_TRACE");
+    startup_trace_value = v && v[0] && strcmp(v, "0") != 0;
+}
+
+static inline bool startup_trace_enabled(void)
+{
+    pthread_once(&startup_trace_once, startup_trace_resolve);
+    return startup_trace_value;
+}
+
+static inline uint64_t startup_trace_now_ns(void)
+{
+    if (!startup_trace_enabled())
+        return 0;
+    struct timespec ts;
+    if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
+        return 0;
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+}
+
+static inline void startup_trace_step(const char *label, uint64_t start_ns)
+{
+    if (start_ns == 0)
+        return;
+    uint64_t end_ns = startup_trace_now_ns();
+    if (end_ns < start_ns)
+        return;
+    fprintf(stderr, "startup %-28s %8.3f ms\n", label,
+            (double) (end_ns - start_ns) / 1000000.0);
+}
+
+#endif /* ELFUSE_STARTUP_TRACE_H */
diff --git a/src/core/vdso.c b/src/core/vdso.c
index 444be88..f50c5f8 100644
--- a/src/core/vdso.c
+++ b/src/core/vdso.c
@@ -4,7 +4,7 @@
  * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
  * SPDX-License-Identifier: Apache-2.0
  *
- * Builds a minimal vDSO ELF image in guest memory exposing
+ * Builds a minimal vDSO ELF image in guest memory exposing versioned
  * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}. Each entry
  * point is an SVC trampoline that traps back to the host for the actual work.
  *
@@ -13,7 +13,9 @@
  * CNTVCT_EL0 from the macOS frame of reference while the guest reads it through
  * HVF's CNTVOFF_EL2 virtualization, so the seqlock interpolation produced bogus
  * times (year 26382). The fast path is gone; SVC is correct and the trap cost
- * is negligible compared to the work clock_gettime callers tend to do anyway.
+ * is still one syscall round trip for the plain SVC entry points, but the
+ * versioned ELF metadata lets modern libcs find the trampolines, and
+ * __kernel_clock_gettime gets a new fast path anchored in the guest's frame.
  */
 
 #include <stdint.h>
@@ -44,11 +46,28 @@ typedef struct {
     uint64_t st_value, st_size;
 } elf64_sym_t;
 
+typedef struct {
+    uint16_t vd_version;
+    uint16_t vd_flags;
+    uint16_t vd_ndx;
+    uint16_t vd_cnt;
+    uint32_t vd_hash;
+    uint32_t vd_aux;
+    uint32_t vd_next;
+} elf64_verdef_t;
+
+typedef struct {
+    uint32_t vda_name;
+    uint32_t vda_next;
+} elf64_verdaux_t;
+
 /* ELF constants */
 #define SHT_STRTAB 3
 #define SHT_HASH 5
 #define SHT_DYNAMIC 6
 #define SHT_DYNSYM 11
+#define SHT_GNU_VERDEF 0x6ffffffd
+#define SHT_GNU_VERSYM 0x6fffffff
 #define SHF_ALLOC (1ULL << 1)
 #define SHF_EXECINSTR (1ULL << 2)
 #define DT_NULL 0
@@ -57,8 +76,13 @@ typedef struct {
 #define DT_SYMTAB 6
 #define DT_STRSZ 10
 #define DT_SYMENT 11
+#define DT_VERSYM 0x6ffffff0
+#define DT_VERDEF 0x6ffffffc
+#define DT_VERDEFNUM 0x6ffffffd
 #define STB_GLOBAL 1
 #define STT_FUNC 2
+#define VER_DEF_CURRENT 1
+#define VDSO_LINUX_VERSION_INDEX 2
 #define ELF_ST_INFO(bind, type) (((bind) << 4) | ((type) & 0xf))
 
 /* Layout.
@@ -75,50 +99,115 @@ typedef struct {
 #define VDSO_OFF_PHDR 0x040
 #define VDSO_OFF_PHDR1 0x078
 
-/* .text trampolines (each 12 bytes: mov x8, #N; svc #0; ret). */
-#define TEXT_OFF_SIGRET 0x0B0
-#define TEXT_OFF_GETRES 0x0BC
-#define TEXT_OFF_GETTIME 0x0C8
-#define TEXT_OFF_GETTOD 0x0D4
-#define TEXT_END 0x0E0
+/* vvar at fixed offset; host writes the wall-clock anchor on first
+ * clock_gettime SVC, after the guest trampoline has stored its own
+ * CNTVCT_EL0 read into X9. Layout:
+ *   +0   uint32 initialized (0 unseeded, 2 seeding; host sets 1 after anchors)
+ *   +4   uint32 pad
+ *   +8   uint64 anchor_cntvct (guest frame, written by host from X9)
+ *   +16  uint64 anchor_sec
+ *   +24  uint64 anchor_nsec
+ */
+#define VDSO_OFF_VVAR 0x0B0
+#define VVAR_OFF_INITIALIZED 0x00
+#define VVAR_OFF_ANCHOR_CNTVCT 0x08
+#define VVAR_OFF_ANCHOR_SEC 0x10
+#define VVAR_OFF_ANCHOR_NSEC 0x18
+#define VVAR_SIZE 0x20
+
+/* .text trampolines. rt_sigreturn / clock_getres / gettimeofday are 12-byte
+ * SVC trampolines. clock_gettime is the CNTVCT-based fast-path trampoline
+ * (112 bytes = 28 instructions including the svc_fallback tail). The
+ * trampoline uses LDAR on the vvar initialized flag, treats both states
+ * 0 (unseeded) and 2 (host-side reservation in vdso_seed_anchor) as
+ * fall-back, and guards the CNTVCT-anchor subtraction against unsigned
+ * underflow via SUBS + B.LO.
+ */
+#define TEXT_OFF_SIGRET 0x0D0
+#define TEXT_OFF_GETRES 0x0DC
+#define TEXT_OFF_GETTIME 0x0E8
+#define TEXT_GETTIME_SIZE 0x70
+#define TEXT_OFF_GETTOD (TEXT_OFF_GETTIME + TEXT_GETTIME_SIZE)
+#define TEXT_END (TEXT_OFF_GETTOD + 12)
+/* Address of the SVC inside __kernel_clock_gettime's svc_fallback (offset
+ * 0x68 within the trampoline). The host's sys_clock_gettime uses this
+ * value to gate vvar seeding: only a trap whose ELR_EL1 equals SVC_PC + 4
+ * came from the trampoline and may carry a trustworthy CNTVCT in X9.
+ */
+#define VDSO_CLOCK_GETTIME_SVC_PC (TEXT_OFF_GETTIME + 0x68)
+
+/* dynstr, dynsym, hash, GNU version metadata, dynamic, shdr follow.
+ * TEXT_END is 0x164 after the fast-path expansion; pad to 8-byte align.
+ */
+#define VDSO_OFF_DYNSTR 0x168
 
-/* dynstr, dynsym, hash, dynamic, shdr follow */
-#define VDSO_OFF_DYNSTR 0x0E0
-#define DYNSTR_SIZE 90
+/* Padded to 8-byte align: 0x168 + 103 = 0x1CF, pad to 0x1D0 */
+#define VDSO_OFF_DYNSYM 0x1D0
 
-/* Padded to 4-byte align: 0x0E0 + 90 = 0x13A, pad to 0x13C */
-#define VDSO_OFF_DYNSYM 0x13C
+/* 5 * 24 = 120, 0x1D0 + 120 = 0x248 */
+#define VDSO_OFF_HASH 0x248
 
-/* 5 * 24 = 120, 0x13C + 120 = 0x1B4 */
-#define VDSO_OFF_HASH 0x1B4
+/* 2+1+5 = 8 words * 4 = 32, 0x248 + 32 = 0x268 */
+#define VDSO_OFF_VERSYM 0x268
 
-/* 2+1+5 = 8 words * 4 = 32, 0x1B4 + 32 = 0x1D4, pad to 0x1D8 */
-#define VDSO_OFF_DYNAMIC 0x1D8
+/* 5 * 2 = 10, 0x268 + 10 = 0x272, pad to 0x278 */
+#define VDSO_OFF_VERDEF 0x278
 
-/* 6 * 16 = 96, 0x1D8 + 96 = 0x238 */
-#define VDSO_OFF_SHDR 0x238
+/* Verdef + verdaux = 28, 0x278 + 28 = 0x294, pad to 0x298 */
+#define VDSO_OFF_DYNAMIC 0x298
 
-/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KiB) */
+/* 9 * 16 = 144, 0x298 + 144 = 0x328 */
+#define VDSO_OFF_SHDR 0x328
+
+/* 8 * 64 = 512, 0x328 + 512 = 0x528 (fits in 4 KiB) */
 #define VDSO_NUM_SYMS 4
 #define HASH_NCHAIN (VDSO_NUM_SYMS + 1)
 #define HASH_NBUCKET 1
 #define HASH_SIZE ((2 + HASH_NBUCKET + HASH_NCHAIN) * sizeof(uint32_t))
+#define VERSYM_SIZE ((VDSO_NUM_SYMS + 1) * sizeof(uint16_t))
+#define VERDEF_SIZE (sizeof(elf64_verdef_t) + sizeof(elf64_verdaux_t))
+#define VDSO_NUM_DYN 9
 
 /* .dynstr data */
 static const char dynstr_data[] =
     "\0__kernel_rt_sigreturn"
     "\0__kernel_clock_getres"
     "\0__kernel_clock_gettime"
-    "\0__kernel_gettimeofday";
-
-/* Symbol name offsets */
-static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {1, 23, 45, 68};
+    "\0__kernel_gettimeofday"
+    "\0LINUX_2.6.39";
+#define DYNSTR_SIZE sizeof(dynstr_data)
+
+/* Symbol name offsets, derived from preceding string-literal lengths so a
+ * future edit to dynstr_data shifts them in lockstep instead of silently
+ * breaking the version lookup (sizeof("\0X") - 1 == bytes contributed when
+ * X is concatenated into dynstr_data; only the very last literal's trailing
+ * NUL survives concatenation).
+ */
+#define DYNSTR_BYTES_RT_SIGRETURN (sizeof("\0__kernel_rt_sigreturn") - 1)
+#define DYNSTR_BYTES_CLOCK_GETRES (sizeof("\0__kernel_clock_getres") - 1)
+#define DYNSTR_BYTES_CLOCK_GETTIME (sizeof("\0__kernel_clock_gettime") - 1)
+#define DYNSTR_BYTES_GETTIMEOFDAY (sizeof("\0__kernel_gettimeofday") - 1)
+
+static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {
+    1,
+    DYNSTR_BYTES_RT_SIGRETURN + 1,
+    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + 1,
+    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES +
+        DYNSTR_BYTES_CLOCK_GETTIME + 1,
+};
+/* Skip the leading \0 of "\0LINUX_2.6.39" to land on 'L'. */
+#define VDSO_LINUX_VERSION_NAME_OFF                          \
+    (DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + \
+     DYNSTR_BYTES_CLOCK_GETTIME + DYNSTR_BYTES_GETTIMEOFDAY + 1)
+
+_Static_assert(sizeof(dynstr_data) <= 104,
+               "dynstr_data outgrew the DYNSYM padding window");
 
 /* Symbol text offsets and sizes */
 static const uint32_t sym_text_off[VDSO_NUM_SYMS] = {
     TEXT_OFF_SIGRET, TEXT_OFF_GETRES, TEXT_OFF_GETTIME, TEXT_OFF_GETTOD};
-static const uint32_t sym_text_size[VDSO_NUM_SYMS] = {
-    12, 12, TEXT_OFF_GETTOD - TEXT_OFF_GETTIME, 12};
+static const uint32_t sym_text_size[VDSO_NUM_SYMS] = {12, 12, TEXT_GETTIME_SIZE,
+                                                      12};
 
 /* Emit a 12-byte SVC trampoline: mov x8, #syscall_nr; svc #0; ret. */
 static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr)
@@ -129,6 +218,209 @@ static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr)
     code[2] = 0xD65F03C0U; /* ret    */
 }
 
+/* CNTVCT-based fast-path trampoline for __kernel_clock_gettime. The guest
+ * always reads CNTVCT_EL0 into X9 first, then either falls through to a
+ * full SVC (any clockid other than CLOCK_MONOTONIC, vvar not yet seeded,
+ * CNTVCT behind the anchor) or interpolates wall_clock from the vvar
+ * anchor. The host's sys_clock_gettime handler reads X9 on the first SVC
+ * and seeds the vvar (anchor_cntvct = X9, anchor_sec/nsec = wall_clock),
+ * so subsequent calls skip the trap. CNTKCTL_EL1.EL0VCTEN is set in
+ * bootstrap to allow the MRS at EL0; without that the trampoline gets 0
+ * back and the math collapses.
+ *
+ * Layout as emitted by emit_clock_gettime_trampoline below:
+ *
+ *   00: mrs  x9, cntvct_el0          ; always read first
+ *   04: cmp  w0, #1                  ; CLOCK_MONOTONIC?
+ *   08: b.ne svc_fallback
+ *   0C: adr  x2, vvar
+ *   10: ldar w3, [x2]                ; acquire-load initialized
+ *   14: cmp  w3, #1                  ; 1 = seeded (0 unseeded, 2 reserving)
+ *   18: b.ne svc_fallback
+ *   1C: ldr  x3, [x2, #ANCHOR_CNTVCT]
+ *   20: ldr  x4, [x2, #ANCHOR_SEC]
+ *   24: ldr  x5, [x2, #ANCHOR_NSEC]
+ *   28: subs x6, x9, x3              ; delta cycles (CNTFRQ = 24 MHz)
+ *   2C: b.lo svc_fallback            ; CNTVCT behind the anchor
+ *   30: mov  x7, #125
+ *   34: mul  x6, x6, x7              ; delta * 125
+ *   38: mov  x7, #3
+ *   3C: udiv x6, x6, x7              ; delta_ns
+ *   40: add  x5, x5, x6              ; raw nsec
+ *   44: mov  x7, #0xCA00
+ *   48: movk x7, #0x3B9A, lsl #16    ; x7 = 1e9
+ *   4C: udiv x8, x5, x7              ; sec carry
+ *   50: msub x5, x8, x7, x5          ; nsec %= 1e9
+ *   54: add  x4, x4, x8              ; final sec
+ *   58: stp  x4, x5, [x1]            ; store {sec, nsec}
+ *   5C: mov  x0, #0
+ *   60: ret
+ *   64: (svc_fallback: mov x8 #113; svc #0; ret)
+ *
+ * svc_fallback shares __kernel_clock_gettime's slot; both paths end in RET.
+ */
+
+/* AArch64 instruction encoders (only the ones used here). */
+static uint32_t enc_movz_x(unsigned rd, uint16_t imm)
+{
+    return (rd & 0x1F) | ((uint32_t) imm << 5) | 0xD2800000U; /* MOVZ Xd */
+}
+
+static uint32_t enc_movk_x_lsl16(unsigned rd, uint16_t imm)
+{
+    return (rd & 0x1F) | ((uint32_t) imm << 5) | 0xF2A00000U; /* MOVK, hw=1 */
+}
+
+static uint32_t enc_adr(unsigned rd, int32_t pc_rel)
+{
+    /* Convert to unsigned first: >> on a negative int is impl-defined. */
+    uint32_t imm = (uint32_t) pc_rel & 0x1FFFFF; /* immhi:immlo, +/-1 MiB */
+    return 0x10000000U | ((imm & 0x3) << 29) | ((imm >> 2) << 5) | (rd & 31);
+}
+
+/* B.cond imm19. cond is the 4-bit AArch64 condition (NE=0x1, LO=0x3, etc.). */
+#define COND_NE 0x1
+#define COND_LO 0x3
+static uint32_t enc_bcond_imm19(unsigned cond, int32_t pc_rel)
+{
+    uint32_t imm19 = ((uint32_t) pc_rel >> 2) & 0x7FFFF; /* no signed >> */
+    return 0x54000000U | (imm19 << 5) | (cond & 0xF);
+}
+
+static uint32_t enc_ldr_x_imm12(unsigned rt, unsigned rn, uint32_t off_bytes)
+{
+    uint32_t imm12 = (off_bytes / 8) & 0xFFF; /* scaled by 8; keep in field */
+    return 0xF9400000U | (imm12 << 10) | ((rn & 0x1F) << 5) | (rt & 0x1F);
+}
+
+static uint32_t enc_add_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return (rd & 0x1F) | ((rn & 0x1F) << 5) | ((rm & 0x1F) << 16) | 0x8B000000U;
+}
+
+static uint32_t enc_mul_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return (rd & 0x1F) | ((rn & 0x1F) << 5) | ((rm & 0x1F) << 16) | 0x9B007C00U;
+}
+
+static uint32_t enc_udiv_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return (rd & 0x1F) | ((rn & 0x1F) << 5) | ((rm & 0x1F) << 16) | 0x9AC00800U;
+}
+
+static uint32_t enc_msub_x(unsigned rd, unsigned rn, unsigned rm, unsigned ra)
+{
+    uint32_t regs = ((rm & 0x1F) << 16) | ((ra & 0x1F) << 10) | (rd & 0x1F);
+    return 0x9B008000U | regs | ((rn & 0x1F) << 5); /* Xd = Xa - Xn * Xm */
+}
+
+static uint32_t enc_stp_x_imm7(unsigned rt1,
+                               unsigned rt2,
+                               unsigned rn,
+                               int32_t off_bytes)
+{
+    uint32_t scaled = (uint32_t) (off_bytes / 8) & 0x7F; /* imm7, x8 bytes */
+    return (rt1 & 0x1F) | ((rn & 0x1F) << 5) | ((rt2 & 0x1F) << 10) |
+           (scaled << 15) | 0xA9000000U;
+}
+
+static uint32_t enc_cmp_w_imm12(unsigned rn, uint32_t imm12)
+{
+    /* CMP Wn, #imm12 is the alias of SUBS WZR, Wn, #imm12 (Rd = 31). */
+    return ((rn & 0x1F) << 5) | ((imm12 & 0xFFF) << 10) | 0x7100001FU;
+}
+
+/* LDAR Wt, [Xn]: 32-bit load-acquire. The host publishes the vvar with
+ * __atomic_store_n(initialized, ..., __ATOMIC_RELEASE); pairing that with
+ * this load means a guest that sees initialized != 0 also sees the anchor.
+ */
+static uint32_t enc_ldar_w(unsigned rt, unsigned rn)
+{
+    return (rt & 0x1F) | ((rn & 0x1F) << 5) | 0x88DFFC00U;
+}
+
+/* SUBS Xd, Xn, Xm: Xd = Xn - Xm, updating NZCV. */
+static uint32_t enc_subs_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return (rd & 0x1F) | ((rn & 0x1F) << 5) | ((rm & 0x1F) << 16) | 0xEB000000U;
+}
+
+/* Emit the CNTVCT fast-path clock_gettime trampoline into code[0..27]. The
+ * trampoline sits at page+pc_off and the vvar at page+vvar_off. It is exactly
+ * TEXT_GETTIME_SIZE bytes long; the static_assert below catches drift.
+ */
+static void emit_clock_gettime_trampoline(uint32_t *code,
+                                          uint32_t pc_off,
+                                          uint32_t vvar_off)
+{
+    /* Byte offsets, inside the trampoline, of the three fallback branches,
+     * the ADR, and the svc_fallback tail. Each instruction is 4 bytes, so
+     * offset / 4 is its code[] index. The branch/ADR encoders take a byte
+     * displacement (target - own pc) and scale it themselves.
+     */
+    enum { TR_BNE_CLOCKID = 0x08, TR_ADR = 0x0C, TR_BNE_SEEDED = 0x18,
+           TR_BLO = 0x2C, TR_SVC = 0x64 };
+    const int32_t vvar_rel =
+        (int32_t) vvar_off - (int32_t) (pc_off + (uint32_t) TR_ADR);
+
+    code[0] = 0xD53BE049U;           /* mrs  x9, cntvct_el0      */
+    code[1] = enc_cmp_w_imm12(0, 1); /* cmp  w0, #1              */
+    code[TR_BNE_CLOCKID / 4] =       /* b.ne svc_fallback        */
+        enc_bcond_imm19(COND_NE, TR_SVC - TR_BNE_CLOCKID);
+    code[TR_ADR / 4] = enc_adr(2, vvar_rel); /* adr  x2, vvar            */
+    code[4] = enc_ldar_w(3, 2);              /* ldar w3, [x2]            */
+    code[5] = enc_cmp_w_imm12(3, 1);         /* cmp  w3, #1              */
+    code[TR_BNE_SEEDED / 4] =                /* b.ne svc_fallback        */
+        enc_bcond_imm19(COND_NE, TR_SVC - TR_BNE_SEEDED);
+    code[7] = enc_ldr_x_imm12(3, 2, VVAR_OFF_ANCHOR_CNTVCT);
+    code[8] = enc_ldr_x_imm12(4, 2, VVAR_OFF_ANCHOR_SEC);
+    code[9] = enc_ldr_x_imm12(5, 2, VVAR_OFF_ANCHOR_NSEC);
+    code[10] = enc_subs_x(6, 9, 3); /* subs x6, x9, x3 (delta)  */
+    code[TR_BLO / 4] =              /* b.lo svc_fallback        */
+        enc_bcond_imm19(COND_LO, TR_SVC - TR_BLO);
+    code[12] = enc_movz_x(7, 125);
+    code[13] = enc_mul_x(6, 6, 7); /* delta * 125              */
+    code[14] = enc_movz_x(7, 3);
+    code[15] = enc_udiv_x(6, 6, 7); /* delta_ns = delta*125/3   */
+    code[16] = enc_add_x(5, 5, 6);  /* nsec + delta_ns          */
+    code[17] = enc_movz_x(7, 0xCA00);
+    code[18] = enc_movk_x_lsl16(7, 0x3B9A); /* x7 = 1e9                 */
+    code[19] = enc_udiv_x(8, 5, 7);         /* sec_carry                */
+    code[20] = enc_msub_x(5, 8, 7, 5);      /* nsec %= 1e9              */
+    code[21] = enc_add_x(4, 4, 8);          /* sec += carry             */
+    code[22] = enc_stp_x_imm7(4, 5, 1, 0);  /* stp x4, x5, [x1]         */
+    code[23] = enc_movz_x(0, 0);            /* return 0                 */
+    code[24] = 0xD65F03C0U;                 /* ret                      */
+    /* svc_fallback: mov x8, #113; svc #0; ret */
+    code[TR_SVC / 4 + 0] = enc_movz_x(8, 113);
+    code[TR_SVC / 4 + 1] = 0xD4000001U; /* svc #0                   */
+    code[TR_SVC / 4 + 2] = 0xD65F03C0U; /* ret                      */
+}
+
+_Static_assert(28 * sizeof(uint32_t) == TEXT_GETTIME_SIZE,
+               "clock_gettime trampoline size must match emitter");
+
+/* signal.c points X30 at VDSO_BASE + VDSO_OFF_SIGRET as the handler's
+ * return target, so the public offset declared in core/vdso.h has to stay
+ * equal to the internal text layout used in this file.
+ */
+_Static_assert(TEXT_OFF_SIGRET == VDSO_OFF_SIGRET,
+               "VDSO_OFF_SIGRET in core/vdso.h must equal TEXT_OFF_SIGRET");
+
+static uint32_t elf_hash(const char *name)
+{
+    /* System V gABI hash: shift in 4 bits per byte, fold the top nibble
+     * back into bits 4..7, then clear it so the result fits in 28 bits.
+     */
+    uint32_t acc = 0;
+    for (const unsigned char *p = (const unsigned char *) name; *p; p++) {
+        acc = (acc << 4) + *p;
+        uint32_t top = acc & 0xf0000000U;
+        acc = (acc ^ (top >> 24)) & ~top;
+    }
+    return acc;
+}
+
 uint64_t vdso_build(guest_t *g)
 {
     uint8_t *page = (uint8_t *) guest_ptr(g, VDSO_BASE);
@@ -160,7 +452,7 @@ uint64_t vdso_build(guest_t *g)
     ehdr->e_phentsize = sizeof(elf64_phdr_t);
     ehdr->e_phnum = 2;
     ehdr->e_shentsize = sizeof(elf64_shdr_t);
-    ehdr->e_shnum = 6;
+    ehdr->e_shnum = 8;
     ehdr->e_shstrndx = 2;
 
     /* Program header 0: PT_LOAD. */
@@ -181,8 +473,8 @@ uint64_t vdso_build(guest_t *g)
     phdr1->p_offset = VDSO_OFF_DYNAMIC;
     phdr1->p_vaddr = VDSO_OFF_DYNAMIC;
     phdr1->p_paddr = VDSO_OFF_DYNAMIC;
-    phdr1->p_filesz = 6 * sizeof(elf64_dyn_t);
-    phdr1->p_memsz = 6 * sizeof(elf64_dyn_t);
+    phdr1->p_filesz = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
+    phdr1->p_memsz = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
     phdr1->p_align = 8;
 
     /* Text trampolines.  Each entry is the same 12-byte mov/svc/ret pattern
@@ -190,9 +482,14 @@ uint64_t vdso_build(guest_t *g)
      */
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_SIGRET), 139);
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETRES), 114);
-    emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), 113);
+    emit_clock_gettime_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME),
+                                  TEXT_OFF_GETTIME, VDSO_OFF_VVAR);
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTOD), 169);
 
+    /* vvar starts zero (initialized==0). The first __kernel_clock_gettime
+     * SVC fallback will let the host populate the anchor.
+     */
+
     /* Dynamic string table. */
     memcpy(page + VDSO_OFF_DYNSTR, dynstr_data, DYNSTR_SIZE);
 
@@ -221,6 +518,27 @@ uint64_t vdso_build(guest_t *g)
     }
     hash[2] = first_sym;
 
+    /* GNU symbol versioning. glibc's aarch64 vDSO resolver asks for
+     * LINUX_2.6.39 and ignores unversioned helpers.
+     */
+    uint16_t *versym = (uint16_t *) (page + VDSO_OFF_VERSYM);
+    versym[0] = 0;
+    for (int i = 1; i <= VDSO_NUM_SYMS; i++)
+        versym[i] = VDSO_LINUX_VERSION_INDEX;
+
+    elf64_verdef_t *verdef = (elf64_verdef_t *) (page + VDSO_OFF_VERDEF);
+    elf64_verdaux_t *verdaux =
+        (elf64_verdaux_t *) (page + VDSO_OFF_VERDEF + sizeof(*verdef));
+    verdef->vd_version = VER_DEF_CURRENT;
+    verdef->vd_flags = 0;
+    verdef->vd_ndx = VDSO_LINUX_VERSION_INDEX;
+    verdef->vd_cnt = 1;
+    verdef->vd_hash = elf_hash("LINUX_2.6.39");
+    verdef->vd_aux = sizeof(*verdef);
+    verdef->vd_next = 0;
+    verdaux->vda_name = VDSO_LINUX_VERSION_NAME_OFF;
+    verdaux->vda_next = 0;
+
     /* Dynamic table. */
     elf64_dyn_t *dyn = (elf64_dyn_t *) (page + VDSO_OFF_DYNAMIC);
     dyn[0] = (elf64_dyn_t) {DT_HASH, VDSO_OFF_HASH};
@@ -228,7 +546,10 @@ uint64_t vdso_build(guest_t *g)
     dyn[2] = (elf64_dyn_t) {DT_STRTAB, VDSO_OFF_DYNSTR};
     dyn[3] = (elf64_dyn_t) {DT_STRSZ, DYNSTR_SIZE};
     dyn[4] = (elf64_dyn_t) {DT_SYMENT, sizeof(elf64_sym_t)};
-    dyn[5] = (elf64_dyn_t) {DT_NULL, 0};
+    dyn[5] = (elf64_dyn_t) {DT_VERSYM, VDSO_OFF_VERSYM};
+    dyn[6] = (elf64_dyn_t) {DT_VERDEF, VDSO_OFF_VERDEF};
+    dyn[7] = (elf64_dyn_t) {DT_VERDEFNUM, 1};
+    dyn[8] = (elf64_dyn_t) {DT_NULL, 0};
 
     /* Section headers. */
     elf64_shdr_t *shdr = (elf64_shdr_t *) (page + VDSO_OFF_SHDR);
@@ -276,10 +597,71 @@ uint64_t vdso_build(guest_t *g)
     shdr[5].sh_flags = SHF_ALLOC;
     shdr[5].sh_addr = VDSO_OFF_DYNAMIC;
     shdr[5].sh_offset = VDSO_OFF_DYNAMIC;
-    shdr[5].sh_size = 6 * sizeof(elf64_dyn_t);
+    shdr[5].sh_size = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
     shdr[5].sh_link = 2;
     shdr[5].sh_addralign = 8;
     shdr[5].sh_entsize = sizeof(elf64_dyn_t);
 
+    shdr[6].sh_name = 0;
+    shdr[6].sh_type = SHT_GNU_VERSYM;
+    shdr[6].sh_flags = SHF_ALLOC;
+    shdr[6].sh_addr = VDSO_OFF_VERSYM;
+    shdr[6].sh_offset = VDSO_OFF_VERSYM;
+    shdr[6].sh_size = VERSYM_SIZE;
+    shdr[6].sh_link = 3;
+    shdr[6].sh_addralign = 2;
+    shdr[6].sh_entsize = sizeof(uint16_t);
+
+    shdr[7].sh_name = 0;
+    shdr[7].sh_type = SHT_GNU_VERDEF;
+    shdr[7].sh_flags = SHF_ALLOC;
+    shdr[7].sh_addr = VDSO_OFF_VERDEF;
+    shdr[7].sh_offset = VDSO_OFF_VERDEF;
+    shdr[7].sh_size = VERDEF_SIZE;
+    shdr[7].sh_link = 2;
+    shdr[7].sh_info = 1;
+    shdr[7].sh_addralign = 4;
+
     return VDSO_BASE;
 }
+
+void vdso_seed_anchor(guest_t *g,
+                      uint64_t guest_cntvct,
+                      int64_t anchor_sec,
+                      int64_t anchor_nsec)
+{
+    uint8_t *base = (uint8_t *) guest_ptr(g, VDSO_BASE);
+    if (!base)
+        return;
+    uint8_t *vvar = base + VDSO_OFF_VVAR;
+    uint32_t *state = (uint32_t *) vvar; /* "initialized" word, offset 0 */
+
+    /* state is a three-way flag: 0 = unseeded, 2 = one host thread is
+     * writing the anchor, 1 = anchor ready. Several host threads may take
+     * the first-call SVC fallback at once, and the anchor fields are plain
+     * stores, so only the thread whose CAS moves 0 -> 2 may write them;
+     * everyone else returns. The guest trampoline takes the fast path only
+     * when its LDAR reads 1, so while the flag is 2 guests keep using the
+     * SVC fallback.
+     */
+    uint32_t unseeded = 0;
+    if (!__atomic_compare_exchange_n(state, &unseeded, 2,
+                                     /* weak */ false, __ATOMIC_ACQUIRE,
+                                     __ATOMIC_RELAXED))
+        return;
+
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_NSEC) = (uint64_t) anchor_nsec;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_SEC) = (uint64_t) anchor_sec;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_CNTVCT) = guest_cntvct;
+
+    /* Publish with a release store. It pairs with the trampoline's LDAR
+     * of the same word, so a guest that reads 1 is also guaranteed to see
+     * the three anchor fields written above.
+     */
+    __atomic_store_n(state, 1, __ATOMIC_RELEASE);
+}
+
+uint64_t vdso_clock_gettime_svc_pc(void)
+{
+    return VDSO_CLOCK_GETTIME_SVC_PC + VDSO_BASE; /* offset -> guest VA */
+}
diff --git a/src/core/vdso.h b/src/core/vdso.h
index e3a41d5..b1ea9c2 100644
--- a/src/core/vdso.h
+++ b/src/core/vdso.h
@@ -17,12 +17,36 @@
 /* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */
 #define VDSO_BASE 0x0000F000ULL
 #define VDSO_SIZE 0x00001000ULL /* 4KiB */
-#define VDSO_OFF_TEXT 0x0B0     /* Offset of .text (trampoline code) */
+/* Offset of __kernel_rt_sigreturn (the signal trampoline glibc/musl jumps
+ * to via X30/LR after the handler returns). Must match TEXT_OFF_SIGRET in
+ * src/core/vdso.c; kept here so signal.c can target it without including
+ * the vDSO internals.
+ */
+#define VDSO_OFF_SIGRET 0x0D0
 
 /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory.
  * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM
- * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to
- * a small trampoline (mov x8, #139; svc #0).
+ * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to a
+ * small trampoline (mov x8, #139; svc #0).
  * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure.
  */
 uint64_t vdso_build(guest_t *g);
+
+/* If the vvar anchor has not been seeded yet, install the supplied cntvct as
+ * the guest-frame anchor paired with the given wall_clock. Idempotent: calls
+ * that find the anchor already seeded, or mid-seed on another thread, are
+ * no-ops. Used by sys_clock_gettime to upgrade the first
+ * __kernel_clock_gettime SVC fallback into a permanent vvar fast path.
+ */
+void vdso_seed_anchor(guest_t *g,
+                      uint64_t guest_cntvct,
+                      int64_t anchor_sec,
+                      int64_t anchor_nsec);
+
+/* GVA at which the trampoline's svc_fallback issues its SVC. Used by
+ * sys_clock_gettime to verify a clock_gettime trap actually came from the vDSO
+ * fallback path (and thus carries a guest-frame CNTVCT in X9) versus an
+ * unrelated raw syscall(SYS_clock_gettime, ...). The trap returns to SVC_PC
+ * + 4, so callers compare ELR_EL1 against that.
+ */
+uint64_t vdso_clock_gettime_svc_pc(void);
diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index f9746cd..edf1758 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -21,6 +21,7 @@
 #include "debug/log.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
+#include "syscall/io.h"
 #include "syscall/mem.h"
 #include "syscall/proc.h"
 
@@ -249,9 +250,19 @@ int fork_ipc_send_fd_table(int ipc_sock)
         if (fd_table[i].type == FD_CLOSED)
             continue;
 
+        /* Synthetic-fd types are filtered here; see fd_type_is_synthetic
+         * in syscall/internal.h for the rationale (kqueue cannot cross
+         * SCM_RIGHTS on macOS, per-class side tables are not serialized).
+         * The child sees these slots as FD_CLOSED and recreates them via
+         * the appropriate syscall.
+         */
+        int t = fd_table[i].type;
+        if (fd_type_is_synthetic(t))
+            continue;
+
         int host_fd;
         bool was_duped = false;
-        if (fd_table[i].type != FD_STDIO) {
+        if (t != FD_STDIO) {
             int duped = dup(fd_table[i].host_fd);
             if (duped < 0)
                 continue;
@@ -315,8 +326,11 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
         return -1;
     }
 
-    if (num_fds == 0)
+    if (num_fds == 0) {
+        for (int fd = 0; fd < 3; fd++)
+            fd_mark_closed(fd);
         return 0;
+    }
 
     ipc_fd_entry_t *fd_entries = calloc(num_fds, sizeof(ipc_fd_entry_t));
     if (!fd_entries)
@@ -328,6 +342,16 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
         return -1;
     }
 
+    bool low_fd_present[3] = {false, false, false};
+    for (uint32_t i = 0; i < num_fds; i++) {
+        int gfd = fd_entries[i].guest_fd;
+        if (RANGE_CHECK(gfd, 0, 3) && !fd_type_is_synthetic(fd_entries[i].type))
+            low_fd_present[gfd] = true;
+    }
+    for (int fd = 0; fd < 3; fd++)
+        if (!low_fd_present[fd])
+            fd_mark_closed(fd);
+
     int *host_fds = calloc(num_fds, sizeof(int));
     if (!host_fds) {
         free(fd_entries);
@@ -364,12 +388,30 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
             memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path,
                    sizeof(fd_table[gfd].proc_path));
             fd_table[gfd].seals = fd_entries[i].seals;
+        } else if (fd_type_is_synthetic(fd_entries[i].type)) {
+            /* Defense in depth: the parent's fork_ipc_send_fd_table
+             * already filters synthetic types out of the SCM_RIGHTS
+             * payload (see fd_type_is_synthetic in syscall/internal.h).
+             * If anything still arrives here, drop the inherited host
+             * fd and leave the slot FD_CLOSED so the child must
+             * recreate the fd via the appropriate syscall.
+             */
+            log_debug(
+                "fork-child: dropping unexpected synthetic-type fd %d (type "
+                "%d)",
+                gfd, fd_entries[i].type);
+            close(host_fds[i]);
+            fd_mark_closed(gfd);
+            continue;
         } else {
-            fd_alloc_at(gfd, fd_entries[i].type, host_fds[i]);
+            void (*cleanup)(int) = fd_cleanup_for_type(fd_entries[i].type);
+            fd_alloc_at(gfd, fd_entries[i].type, host_fds[i], cleanup);
             fd_table[gfd].linux_flags = fd_entries[i].linux_flags;
             memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path,
                    sizeof(fd_table[gfd].proc_path));
             fd_table[gfd].seals = fd_entries[i].seals;
+            if (fd_entries[i].type == FD_URANDOM)
+                urandom_fd_reset_cache(gfd);
 
             if (fd_entries[i].type != FD_DIR)
                 continue;
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 963cb61..59c9ffe 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -1272,7 +1272,7 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      *
      * Rosetta guests are excluded from CoW even when shm-backed: rosetta's
      * JIT state (TLS slabs, code caches, indirect-call tables, block lists)
-     * is process-local and corrupts when COW-shared. The legacy region-copy
+     * is process-local and corrupts when CoW-shared. The legacy region-copy
      * path preserves the parent's JIT state independently per child.
      */
     bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta;
diff --git a/src/syscall/abi.h b/src/syscall/abi.h
index eda9bc7..122b351 100644
--- a/src/syscall/abi.h
+++ b/src/syscall/abi.h
@@ -639,6 +639,7 @@ typedef struct {
 #define FD_FUSE_DEV 14
 #define FD_FUSE_FILE 15
 #define FD_FUSE_DIR 16
+#define FD_URANDOM 17
 #define FD_VIRTUAL_PATH_MAX 64
 
 /* File sealing flags (F_SEAL_*) for memfd_create. Tracked per-FD. */
diff --git a/src/syscall/fd.c b/src/syscall/fd.c
index c1f828f..f06b0d2 100644
--- a/src/syscall/fd.c
+++ b/src/syscall/fd.c
@@ -104,6 +104,7 @@ void timerfd_init(void)
 {
     for (int i = 0; i < TIMERFD_MAX; i++)
         timerfd_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_TIMERFD, timerfd_close);
 }
 
 static int timerfd_find(int guest_fd)
@@ -514,10 +515,20 @@ static void timerfd_close(int guest_fd)
 #define LINUX_EFD_NONBLOCK 0x800  /* Same as O_NONBLOCK */
 #define LINUX_EFD_SEMAPHORE 1
 
-/* Per-eventfd state */
+/* Per-eventfd state. The slot is shared across guest_fds that point at it (via
+ * dup/dup2/fcntl F_DUPFD), matching the Linux contract that dup'd eventfd fds
+ * share the same kernel object. eventfd_owner[gfd] maps a guest_fd to its slot;
+ * multiple guest_fds can map to the same slot. The slot owns its own read end
+ * for readiness/drain/blocking operations so it does not depend on any one
+ * guest fd remaining open. The slot is freed when refcount drops to zero. The
+ * slot's guest_fd field is retained for sfd_alloc_slot's
+ * "free if guest_fd == -1" convention and tracks the most recently allocated
+ * primary owner.
+ */
 #define EVENTFD_MAX 32
 static struct {
-    int guest_fd;     /* Guest fd (-1 if unused) */
+    int guest_fd;     /* Primary guest fd, -1 when slot is free */
+    int refcount;     /* Number of guest_fds bound to this slot */
     int pipe_rd;      /* Read end of self-pipe (for poll/epoll readiness) */
     int pipe_wr;      /* Write end of self-pipe */
     uint64_t counter; /* Accumulated event counter */
@@ -525,16 +536,22 @@ static struct {
     int nonblock;     /* O_NONBLOCK */
 } eventfd_state[EVENTFD_MAX];
 
+static int eventfd_owner[FD_TABLE_SIZE]; /* guest_fd -> slot, or -1 */
+
 void eventfd_init(void)
 {
     for (int i = 0; i < EVENTFD_MAX; i++)
         eventfd_state[i].guest_fd = -1;
+    for (int i = 0; i < FD_TABLE_SIZE; i++)
+        eventfd_owner[i] = -1;
+    fd_register_cleanup(FD_EVENTFD, eventfd_close);
 }
 
 static int eventfd_find(int guest_fd)
 {
-    return sfd_find_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]),
-                         guest_fd);
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return -1;
+    return eventfd_owner[guest_fd];
 }
 
 static int eventfd_slot_alloc(void)
@@ -542,6 +559,19 @@ static int eventfd_slot_alloc(void)
     return sfd_alloc_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]));
 }
 
+static void eventfd_release_ref_locked(int slot)
+{
+    /* Callers hold sfd_lock. Only the last reference tears the slot down. */
+    if (--eventfd_state[slot].refcount > 0)
+        return;
+    close(eventfd_state[slot].pipe_rd);
+    close(eventfd_state[slot].pipe_wr);
+    eventfd_state[slot].pipe_rd = eventfd_state[slot].pipe_wr = -1;
+    eventfd_state[slot].refcount = 0;
+    eventfd_state[slot].counter = 0;
+    eventfd_state[slot].guest_fd = -1;
+}
+
 int64_t sys_eventfd2(unsigned int initval, int flags)
 {
     if (flags & ~(LINUX_EFD_CLOEXEC | LINUX_EFD_NONBLOCK | LINUX_EFD_SEMAPHORE))
@@ -564,9 +594,22 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
         return linux_errno();
     }
 
+    int state_rd = dup(pipefd[0]);
+    if (state_rd < 0 || fd_set_nonblock(state_rd) < 0 ||
+        fd_set_cloexec(state_rd) < 0) {
+        int saved_errno = errno;
+        if (state_rd >= 0)
+            close(state_rd);
+        close(pipefd[0]);
+        close(pipefd[1]);
+        errno = saved_errno;
+        return linux_errno();
+    }
+
     /* Allocate guest fd: use read end as the host fd so epoll/poll sees it */
     int gfd = fd_alloc(FD_EVENTFD, pipefd[0], eventfd_close);
     if (gfd < 0) {
+        close(state_rd);
         close(pipefd[0]);
         close(pipefd[1]);
         return -LINUX_EMFILE;
@@ -577,17 +620,20 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
     if (slot < 0) {
         pthread_mutex_unlock(&sfd_lock);
         fd_mark_closed(gfd);
+        close(state_rd);
         close(pipefd[0]);
         close(pipefd[1]);
         return -LINUX_ENOMEM;
     }
 
     eventfd_state[slot].guest_fd = gfd;
-    eventfd_state[slot].pipe_rd = pipefd[0];
+    eventfd_state[slot].refcount = 1;
+    eventfd_state[slot].pipe_rd = state_rd;
     eventfd_state[slot].pipe_wr = pipefd[1];
     eventfd_state[slot].counter = (uint64_t) initval;
     eventfd_state[slot].semaphore = (flags & LINUX_EFD_SEMAPHORE) ? 1 : 0;
     eventfd_state[slot].nonblock = (flags & LINUX_EFD_NONBLOCK) ? 1 : 0;
+    eventfd_owner[gfd] = slot;
     pthread_mutex_unlock(&sfd_lock);
 
     fd_table[gfd].linux_flags =
@@ -610,14 +656,117 @@ static void eventfd_close(int guest_fd)
     pthread_mutex_lock(&sfd_lock);
     int slot = eventfd_find(guest_fd);
     if (slot >= 0) {
-        close(eventfd_state[slot].pipe_wr);
-        /* pipe_rd is closed by sys_close() as host_fd */
-        eventfd_state[slot].guest_fd = -1;
-        eventfd_state[slot].counter = 0;
+        eventfd_owner[guest_fd] = -1;
+        eventfd_release_ref_locked(slot);
     }
     pthread_mutex_unlock(&sfd_lock);
 }
 
+/* Bind an additional guest_fd to the same slot as src_fd, sharing the
+ * counter and pipe state. Two races to defeat:
+ *
+ *   - Source identity. duplicate_guest_fd() snapshots src_fd under
+ *     fd_lock, releases it, then calls us. Between those points src_fd
+ *     could be closed and rebound to a different eventfd. We carry the
+ *     caller's snapshot of fd_table[src_fd].host_fd as src_host_fd and verify
+ *     under fd_lock + sfd_lock that the source fd still has that host fd and
+ *     still maps to a live eventfd slot.
+ *
+ *   - Destination close. fd_alloc_*_relaxed publishes the new guest_fd
+ *     with eventfd_close as cleanup before we install the owner mapping.
+ *     A racing close would run eventfd_close, see owner == -1, skip the
+ *     refcount decrement, and leak the slot. We defeat this by reserving a
+ *     slot ref before publishing the destination, then holding fd_lock +
+ *     sfd_lock together while we verify fd_table[new] is still FD_EVENTFD with
+ *     the host_fd we allocated and set eventfd_owner. Any close that already
+ *     ran is observed here as FD_CLOSED, and we abandon the bind cleanly with
+ *     no leak.
+ */
+int eventfd_dup_fd(int src_fd,
+                   int src_host_fd,
+                   int min_guest_fd,
+                   int fixed_guest_fd,
+                   bool fixed_slot,
+                   int linux_flags)
+{
+    /* Pin the source under fd_lock + sfd_lock and dup the slot-owned
+     * readiness fd. The slot fd is independent of any guest alias, so closing
+     * the source later cannot invalidate eventfd_state[slot].pipe_rd.
+     */
+    pthread_mutex_lock(&fd_lock);
+    pthread_mutex_lock(&sfd_lock);
+    int slot = eventfd_find(src_fd);
+    if (slot < 0 || fd_table[src_fd].type != FD_EVENTFD ||
+        fd_table[src_fd].host_fd != src_host_fd ||
+        eventfd_state[slot].refcount <= 0) {
+        pthread_mutex_unlock(&sfd_lock);
+        pthread_mutex_unlock(&fd_lock);
+        errno = EBADF;
+        return -1;
+    }
+    eventfd_state[slot].refcount++; /* reserved for the destination */
+    int original_pipe_rd = eventfd_state[slot].pipe_rd;
+    int new_host_fd = dup(original_pipe_rd);
+    int saved_errno = errno; /* unlock/close below may clobber errno */
+    if (new_host_fd < 0)
+        eventfd_release_ref_locked(slot);
+    pthread_mutex_unlock(&sfd_lock);
+    pthread_mutex_unlock(&fd_lock);
+    if (new_host_fd < 0) {
+        errno = saved_errno;
+        return -1;
+    }
+
+    /* Publish the destination fd with eventfd_close as cleanup. The
+     * eventfd_owner mapping is still -1, so a racing close here observes
+     * owner == -1 and does nothing; we detect that below.
+     */
+    int new_guest_fd = fixed_slot
+                           ? fd_alloc_at_relaxed(fixed_guest_fd, FD_EVENTFD,
+                                                 new_host_fd, eventfd_close)
+                           : fd_alloc_from_relaxed(min_guest_fd, FD_EVENTFD,
+                                                   new_host_fd, eventfd_close);
+    if (new_guest_fd < 0) {
+        saved_errno = errno;
+        close(new_host_fd);
+        pthread_mutex_lock(&sfd_lock);
+        eventfd_release_ref_locked(slot);
+        pthread_mutex_unlock(&sfd_lock);
+        errno = fixed_slot ? EBADF : saved_errno;
+        return -1;
+    }
+
+    /* Commit under fd_lock then sfd_lock. dest_ours is false when a racing
+     * close already released new_guest_fd (its host fd went with it); the
+     * number may since belong to an unrelated fd, so never tear it down.
+     */
+    pthread_mutex_lock(&fd_lock);
+    pthread_mutex_lock(&sfd_lock);
+    bool dest_ours = fd_table[new_guest_fd].type == FD_EVENTFD &&
+                     fd_table[new_guest_fd].host_fd == new_host_fd;
+    if (!dest_ours || eventfd_state[slot].refcount <= 0 ||
+        eventfd_state[slot].pipe_rd != original_pipe_rd) {
+        pthread_mutex_unlock(&sfd_lock);
+        pthread_mutex_unlock(&fd_lock);
+        /* Tear down only a destination that is still ours. Its owner entry
+         * was never installed, so the reserved ref is released explicitly.
+         */
+        fd_entry_t snap;
+        if (dest_ours && fd_snapshot_and_close(new_guest_fd, &snap))
+            fd_cleanup_entry(new_guest_fd, &snap);
+        pthread_mutex_lock(&sfd_lock);
+        eventfd_release_ref_locked(slot);
+        pthread_mutex_unlock(&sfd_lock);
+        errno = EBADF;
+        return -1;
+    }
+    eventfd_owner[new_guest_fd] = slot;
+    fd_table[new_guest_fd].linux_flags = linux_flags;
+    pthread_mutex_unlock(&sfd_lock);
+    pthread_mutex_unlock(&fd_lock);
+    return new_guest_fd;
+}
+
 /* Read from eventfd: return 8-byte counter value, then reset to 0.
  * In EFD_SEMAPHORE mode, return 1 and decrement counter by 1.
  */
@@ -657,8 +806,12 @@ int64_t eventfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
             return linux_errno();
 
         pthread_mutex_lock(&sfd_lock);
-        /* Re-validate: slot may have been freed by eventfd_close() */
-        if (eventfd_state[slot].guest_fd != guest_fd) {
+        /* Re-validate via the owner table, not eventfd_state[slot].guest_fd:
+         * dup'd aliases bind multiple guest_fds to the same slot, so a
+         * legitimate caller's guest_fd may not equal the primary owner.
+         */
+        if (eventfd_owner[guest_fd] != slot ||
+            eventfd_state[slot].refcount <= 0) {
             pthread_mutex_unlock(&sfd_lock);
             return -LINUX_EBADF;
         }
@@ -809,6 +962,7 @@ void signalfd_init(void)
 {
     for (int i = 0; i < SIGNALFD_MAX; i++)
         signalfd_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_SIGNALFD, signalfd_close);
 }
 
 static int signalfd_find(int guest_fd)
diff --git a/src/syscall/fd.h b/src/syscall/fd.h
index e087ed4..faaf958 100644
--- a/src/syscall/fd.h
+++ b/src/syscall/fd.h
@@ -33,6 +33,21 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva);
 /* eventfd (emulated via pipe + counter) */
 int64_t sys_eventfd2(unsigned int initval, int flags);
 
+/* Duplicate an eventfd into a new guest_fd slot, sharing the counter and
+ * pipe state with src_fd. Mirrors the Linux contract that dup'd eventfds
+ * share the same underlying kernel object. src_host_fd must be the host
+ * fd snapshotted from fd_table[src_fd].host_fd by the caller; the
+ * implementation uses it to verify under fd_lock + sfd_lock that the source
+ * fd still refers to the same live eventfd between the caller's snapshot and
+ * the dup commit. Returns the new guest_fd or -1 with errno set.
+ */
+int eventfd_dup_fd(int src_fd,
+                   int src_host_fd,
+                   int min_guest_fd,
+                   int fixed_guest_fd,
+                   bool fixed_slot,
+                   int linux_flags);
+
 /* signalfd (emulated via synthetic signal reads) */
 int64_t sys_signalfd4(guest_t *g,
                       int fd,
diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c
index 5455f41..9c388c4 100644
--- a/src/syscall/fdtable.c
+++ b/src/syscall/fdtable.c
@@ -169,26 +169,29 @@ int fd_alloc(int type, int host_fd, void (*cleanup)(int))
 /* Allocate the lowest available FD >= minfd. Returns -1 if none available
  * or RLIMIT_NOFILE would be exceeded.
  */
-int fd_alloc_from(int minfd, int type, int host_fd)
+int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int))
 {
     pthread_mutex_lock(&fd_lock);
-    int fd = fd_alloc_locked(minfd, type, host_fd, NULL);
+    int fd = fd_alloc_locked(minfd, type, host_fd, cleanup);
     pthread_mutex_unlock(&fd_lock);
     return fd;
 }
 
-int fd_alloc_from_relaxed(int minfd, int type, int host_fd)
+int fd_alloc_from_relaxed(int minfd,
+                          int type,
+                          int host_fd,
+                          void (*cleanup)(int))
 {
     if (!thread_is_single_active())
-        return fd_alloc_from(minfd, type, host_fd);
-    return fd_alloc_locked(minfd, type, host_fd, NULL);
+        return fd_alloc_from(minfd, type, host_fd, cleanup);
+    return fd_alloc_locked(minfd, type, host_fd, cleanup);
 }
 
 /* Allocate a specific FD slot. Enforces RLIMIT_NOFILE. Properly cleans up any
  * existing entry (including DIR* for directory FDs) before overwriting. Returns
  * -1 if out of range.
  */
-int fd_alloc_at(int fd, int type, int host_fd)
+int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int))
 {
     if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
         return -1;
@@ -204,7 +207,7 @@ int fd_alloc_at(int fd, int type, int host_fd)
     pthread_mutex_lock(&fd_lock);
     if (fd_table[fd].type != FD_CLOSED)
         old = fd_table[fd];
-    fd_init_entry(fd, type, host_fd, NULL);
+    fd_init_entry(fd, type, host_fd, cleanup);
     pthread_mutex_unlock(&fd_lock);
 
     /* Clean up old resources outside fd_lock */
@@ -214,19 +217,19 @@ int fd_alloc_at(int fd, int type, int host_fd)
     return fd;
 }
 
-int fd_alloc_at_relaxed(int fd, int type, int host_fd)
+int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int))
 {
     if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
         return -1;
     if (fd >= rlimit_nofile_cur)
         return -1;
     if (!thread_is_single_active())
-        return fd_alloc_at(fd, type, host_fd);
+        return fd_alloc_at(fd, type, host_fd, cleanup);
 
     if (fd_table[fd].type != FD_CLOSED)
-        return fd_alloc_at(fd, type, host_fd);
+        return fd_alloc_at(fd, type, host_fd, cleanup);
 
-    fd_init_entry(fd, type, host_fd, NULL);
+    fd_init_entry(fd, type, host_fd, cleanup);
     return fd;
 }
 
@@ -334,6 +337,53 @@ bool fd_snapshot(int guest_fd, fd_entry_t *out)
     return ok;
 }
 
+int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out)
+{
+    int dup_fd = -1;
+
+    /* Start from FD_CLOSED so an out-of-range fd yields a closed snapshot. */
+    out->type = FD_CLOSED;
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return dup_fd;
+    pthread_mutex_lock(&fd_lock);
+    if (fd_snapshot_locked(guest_fd, out, false) && out->host_fd >= 0)
+        dup_fd = dup(out->host_fd); /* dup while fd_lock is still held */
+    pthread_mutex_unlock(&fd_lock);
+    return dup_fd;
+}
+
+int fd_get_type(int guest_fd)
+{
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return FD_CLOSED; /* out-of-range fds read as closed */
+    pthread_mutex_lock(&fd_lock);
+    int type = fd_table[guest_fd].type; /* single read under fd_lock */
+    pthread_mutex_unlock(&fd_lock);
+    return type;
+}
+
+/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed
+ * by type. Each slot defaults to NULL (no per-type cleanup). Modules that
+ * own a type call fd_register_cleanup() at init time; dup and fork-restore
+ * paths read back the binding via fd_cleanup_for_type().
+ */
+#define FD_TYPE_REGISTRY_SIZE 32
+static void (*fd_type_cleanup[FD_TYPE_REGISTRY_SIZE])(int);
+
+void fd_register_cleanup(int type, void (*cleanup)(int))
+{
+    /* Types outside the table are ignored; there is no slot to store them. */
+    if (type >= 0 && type < FD_TYPE_REGISTRY_SIZE)
+        fd_type_cleanup[type] = cleanup;
+}
+
+void (*fd_cleanup_for_type(int type))(int)
+{
+    /* Out-of-range types and never-registered slots both yield NULL. */
+    const bool in_table = type >= 0 && type < FD_TYPE_REGISTRY_SIZE;
+    return in_table ? fd_type_cleanup[type] : NULL;
+}
+
 /* Look up a guest FD and return a dup'd host fd that the caller owns.
  * The dup is performed under fd_lock so that close() on another thread
  * cannot invalidate the host fd between lookup and dup. Caller must
diff --git a/src/syscall/fs.c b/src/syscall/fs.c
index ce951eb..426c6df 100644
--- a/src/syscall/fs.c
+++ b/src/syscall/fs.c
@@ -27,6 +27,7 @@
 #include "runtime/procemu.h"
 
 #include "syscall/abi.h"
+#include "syscall/fd.h" /* eventfd_dup_fd */
 #include "syscall/fuse.h"
 #include "syscall/fs.h"
 #include "syscall/internal.h"
@@ -62,6 +63,16 @@ static int opened_fd_type(int host_fd, int linux_flags)
     return FD_REGULAR;
 }
 
+static int intercepted_fd_type(const char *path, int host_fd, int linux_flags)
+{
+    /* Classify as usual; only an FD_REGULAR result for "/dev/urandom" is
+     * promoted. Errors (< 0) and every other type pass through unchanged.
+     */
+    const int base = opened_fd_type(host_fd, linux_flags);
+    const bool is_urandom = path && strcmp(path, "/dev/urandom") == 0;
+    return (base == FD_REGULAR && is_urandom) ? FD_URANDOM : base;
+}
+
 static const char *proc_virtual_dir_path(const char *path,
                                          char *buf,
                                          size_t bufsz);
@@ -168,16 +179,11 @@ static const char *proc_virtual_dir_path(const char *path,
     return virt;
 }
 
-static int dup_fd_type(int guest_fd)
-{
-    return fd_table[guest_fd].type == FD_STDIO ? FD_REGULAR
-                                               : fd_table[guest_fd].type;
-}
-
 static int fd_alloc_opened_host(int host_fd,
                                 int type,
                                 int linux_flags,
-                                int min_guest_fd)
+                                int min_guest_fd,
+                                void (*cleanup)(int))
 {
     DIR *dir = NULL;
 
@@ -193,9 +199,10 @@ static int fd_alloc_opened_host(int host_fd,
         }
     }
 
-    int guest_fd = min_guest_fd >= 0
-                       ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd)
-                       : fd_alloc_from_relaxed(0, type, host_fd);
+    int guest_fd =
+        min_guest_fd >= 0
+            ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd, cleanup)
+            : fd_alloc_from_relaxed(0, type, host_fd, cleanup);
     if (guest_fd < 0) {
         int saved_errno = errno;
         if (dir)
@@ -249,7 +256,7 @@ int64_t sys_openat_path(guest_t *g,
                 return linux_errno();
             }
             int guest_fd =
-                fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1);
+                fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL);
             if (guest_fd < 0) {
                 close_keep_errno(sidecar_fd);
                 return linux_errno();
@@ -278,7 +285,8 @@ int64_t sys_openat_path(guest_t *g,
             close_keep_errno(host_fd);
             return linux_errno();
         }
-        int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+        int guest_fd =
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -303,15 +311,17 @@ int64_t sys_openat_path(guest_t *g,
              * /proc files use fd_alloc_from(128) to avoid races with
              * concurrent GC finalizers that may close stale low-numbered fds.
              */
-            int type = opened_fd_type(intercepted, linux_flags);
+            int type = intercepted_fd_type(tx.intercept_path, intercepted,
+                                           linux_flags);
             if (type < 0) {
                 close_keep_errno(intercepted);
                 return linux_errno();
             }
             int min_guest_fd =
                 (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128;
-            int guest_fd = fd_alloc_opened_host(intercepted, type, linux_flags,
-                                                min_guest_fd);
+            int guest_fd =
+                fd_alloc_opened_host(intercepted, type, linux_flags,
+                                     min_guest_fd, fd_cleanup_for_type(type));
             if (guest_fd < 0) {
                 close_keep_errno(intercepted);
                 return linux_errno();
@@ -336,7 +346,8 @@ int64_t sys_openat_path(guest_t *g,
             close_keep_errno(host_fd);
             return linux_errno();
         }
-        int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+        int guest_fd =
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -358,7 +369,7 @@ int64_t sys_openat_path(guest_t *g,
         close_keep_errno(host_fd);
         return linux_errno();
     }
-    int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+    int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
     if (guest_fd < 0) {
         close_keep_errno(host_fd);
         return linux_errno();
@@ -436,14 +447,16 @@ static void discard_allocated_fd(int guest_fd)
         fd_cleanup_entry(guest_fd, &snap);
 }
 
-static void copy_fd_alias_metadata(int src_fd, int dst_fd, int linux_flags)
+static void install_fd_alias_metadata(int dst_fd,
+                                      const fd_entry_t *src_snap,
+                                      int linux_flags)
 {
-    int preserved_flags = fd_table[src_fd].linux_flags &
+    int preserved_flags = src_snap->linux_flags &
                           (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
                            LINUX_O_DIRECT | LINUX_O_LARGEFILE);
     fd_table[dst_fd].linux_flags = preserved_flags | linux_flags;
-    fd_table[dst_fd].seals = fd_table[src_fd].seals;
-    memcpy(fd_table[dst_fd].proc_path, fd_table[src_fd].proc_path,
+    fd_table[dst_fd].seals = src_snap->seals;
+    memcpy(fd_table[dst_fd].proc_path, src_snap->proc_path,
            sizeof(fd_table[dst_fd].proc_path));
 }
 
@@ -457,28 +470,44 @@ static int duplicate_guest_fd(int src_fd,
                               bool fixed_slot,
                               int linux_flags)
 {
-    if (RANGE_CHECK(src_fd, 0, FD_TABLE_SIZE)) {
-        int t = fd_table[src_fd].type;
-        if (t == FD_FUSE_DEV || t == FD_FUSE_FILE || t == FD_FUSE_DIR)
-            return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot,
-                               linux_flags);
-    }
-
-    host_fd_ref_t host_ref;
-    if (host_fd_ref_open(src_fd, &host_ref) < 0) {
+    /* Snapshot the source entry and dup its host fd in a single fd_lock
+     * critical section so the type, host fd, and metadata captured here
+     * cannot drift apart under a racing close + reopen.
+     */
+    fd_entry_t src_snap;
+    int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap);
+    if (new_host_fd < 0 && src_snap.type == FD_CLOSED) {
         errno = EBADF;
         return -1;
     }
-
-    int new_type = dup_fd_type(src_fd);
-    int new_host_fd = dup(host_ref.fd);
-    host_fd_ref_close(&host_ref);
+    if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE ||
+        src_snap.type == FD_FUSE_DIR) {
+        if (new_host_fd >= 0)
+            close_keep_errno(new_host_fd);
+        return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot,
+                           linux_flags);
+    }
+    /* eventfd dup must share the underlying counter and pipe state across
+     * the source and destination fds (Linux contract). Pass src_snap's
+     * host_fd through so eventfd_dup_fd can verify the source fd still
+     * refers to the same live eventfd between the snapshot here and the
+     * bind there.
+     */
+    if (src_snap.type == FD_EVENTFD) {
+        if (new_host_fd >= 0)
+            close_keep_errno(new_host_fd);
+        return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd,
+                              fixed_guest_fd, fixed_slot, linux_flags);
+    }
     if (new_host_fd < 0)
         return -1;
 
-    int guest_fd =
-        fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type, new_host_fd)
-                   : fd_alloc_from_relaxed(min_guest_fd, new_type, new_host_fd);
+    int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type;
+    void (*cleanup)(int) = fd_cleanup_for_type(new_type);
+    int guest_fd = fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type,
+                                                    new_host_fd, cleanup)
+                              : fd_alloc_from_relaxed(min_guest_fd, new_type,
+                                                      new_host_fd, cleanup);
     if (guest_fd < 0) {
         if (fixed_slot)
             errno = EBADF;
@@ -486,7 +515,7 @@ static int duplicate_guest_fd(int src_fd,
         return -1;
     }
 
-    copy_fd_alias_metadata(src_fd, guest_fd, linux_flags);
+    install_fd_alias_metadata(guest_fd, &src_snap, linux_flags);
     if (clone_dir_stream_if_needed(src_fd, guest_fd, new_host_fd) < 0) {
         int saved_errno = errno;
         discard_allocated_fd(guest_fd);
@@ -600,7 +629,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
             return linux_errno();
         int linux_fl = mac_to_linux_status_flags(mac_fl);
         if (snap.type == FD_REGULAR || snap.type == FD_DIR ||
-            snap.type == FD_PATH)
+            snap.type == FD_PATH || snap.type == FD_URANDOM)
             linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3);
         linux_fl |= snap.linux_flags &
                     (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c
index ae248e1..157191a 100644
--- a/src/syscall/fuse.c
+++ b/src/syscall/fuse.c
@@ -1281,6 +1281,9 @@ void fuse_init(void)
     memset(fuse_file_bindings, 0, sizeof(fuse_file_bindings));
     fuse_next_mount_id = 100;
     pthread_mutex_unlock(&fuse_lock);
+    fd_register_cleanup(FD_FUSE_DEV, fuse_fd_cleanup);
+    fd_register_cleanup(FD_FUSE_FILE, fuse_fd_cleanup);
+    fd_register_cleanup(FD_FUSE_DIR, fuse_fd_cleanup);
 }
 
 int fuse_proc_open(int linux_flags)
@@ -2540,9 +2543,15 @@ int fuse_dup_fd(int src_fd,
         return -1;
     }
 
-    int guest_fd = fixed_slot
-                       ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type, -1)
-                       : fd_alloc_from_relaxed(min_guest_fd, snap.type, -1);
+    /* Install cleanup atomically with the type. Without this, a racing
+     * close between fd_alloc_*_relaxed publishing the slot and the later
+     * fd_table[guest_fd].cleanup assignment would skip fuse_fd_cleanup
+     * and leak the session or file ref.
+     */
+    int guest_fd = fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type,
+                                                    -1, fuse_fd_cleanup)
+                              : fd_alloc_from_relaxed(min_guest_fd, snap.type,
+                                                      -1, fuse_fd_cleanup);
     if (guest_fd < 0) {
         if (fixed_slot)
             errno = EBADF;
@@ -2588,7 +2597,6 @@ int fuse_dup_fd(int src_fd,
                           (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
                            LINUX_O_DIRECT | LINUX_O_LARGEFILE);
     fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
-    fd_table[guest_fd].cleanup = fuse_fd_cleanup;
     pthread_mutex_unlock(&fuse_lock);
     return guest_fd;
 }
diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 7513e5c..d9b54dd 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -111,6 +111,7 @@ void inotify_init(void)
 {
     for (int i = 0; i < INOTIFY_MAX; i++)
         inotify_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_INOTIFY, inotify_close);
 }
 
 static int inotify_find(int guest_fd)
diff --git a/src/syscall/internal.h b/src/syscall/internal.h
index 2760ce9..ca38b62 100644
--- a/src/syscall/internal.h
+++ b/src/syscall/internal.h
@@ -59,32 +59,78 @@ void fdtable_init(void);
  */
 int fd_alloc(int type, int host_fd, void (*cleanup)(int));
 
-/* Allocate the lowest available FD >= minfd. Returns -1 if none available. */
-int fd_alloc_from(int minfd, int type, int host_fd);
+/* Allocate the lowest available FD >= minfd. Returns -1 if none available.
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds).
+ */
+int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int));
 
 /* Allocate the lowest available FD >= minfd with a single-thread fast path.
  * Falls back to fd_alloc_from() when multiple guest threads are active.
  */
-int fd_alloc_from_relaxed(int minfd, int type, int host_fd);
+int fd_alloc_from_relaxed(int minfd,
+                          int type,
+                          int host_fd,
+                          void (*cleanup)(int));
 
-/* Allocate a specific FD slot. Returns -1 if out of range. */
-int fd_alloc_at(int fd, int type, int host_fd);
+/* Allocate a specific FD slot.
+ * Returns -1 if out of range.
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds).
+ */
+int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int));
 
 /* Allocate a specific FD slot with a single-thread fast path.
  * Falls back to fd_alloc_at() when replacement/cleanup must stay serialized.
  */
-int fd_alloc_at_relaxed(int fd, int type, int host_fd);
+int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int));
 
 /* Look up a guest FD. Returns host FD or -1 if invalid.
  * Unsafe for concurrent use; see fd_snapshot/fd_to_host_dup.
  */
 int fd_to_host(int guest_fd);
 
-/* Snapshot an fd entry under fd_lock. Thread-safe alternative to
- * direct fd_table[] access. Returns true on success, false if closed.
+/* Snapshot an fd entry under fd_lock. Thread-safe alternative to direct
+ * fd_table[] access.
+ * Returns true on success, false if closed.
  */
 bool fd_snapshot(int guest_fd, fd_entry_t *out);
 
+/* Snapshot an fd entry AND dup its host fd in a single fd_lock critical
+ * section, closing the TOCTOU window in the dup(2) path. Returns a dup'd
+ * host fd (caller-owned) that is consistent with *out, or -1. On -1,
+ * out->type == FD_CLOSED means no live entry; any other type means *out is
+ * a valid snapshot whose host fd is absent (< 0) or could not be dup'd.
+ */
+int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out);
+
+/* Read just the fd type under fd_lock. Returns FD_CLOSED for out-of-range or
+ * closed slots. Cheaper than fd_snapshot when only the type is needed for
+ * dispatch (sys_read/sys_readv/sys_writev fast paths).
+ */
+int fd_get_type(int guest_fd);
+
+/* Type -> cleanup registry. Modules that own a synthetic fd type register
+ * their cleanup at init time; dup and fork-restore paths look up the
+ * cleanup from the type so the binding stays consistent without each path
+ * re-deriving the dispatch table.
+ */
+void fd_register_cleanup(int type, void (*cleanup)(int));
+void (*fd_cleanup_for_type(int type))(int);
+
+/* True for fd types whose host backing (kqueue for timerfd/inotify, pipe
+ * halves for eventfd/signalfd/netlink/pidfd, epoll instance) cannot be
+ * meaningfully inherited across fork IPC: macOS SCM_RIGHTS rejects kqueue
+ * fds, and the per-class side-table state (eventfd counter, signalfd mask,
+ * pidfd target, epoll set, ...) is not serialized. The child must recreate
+ * such fds via the appropriate syscall, so the parent filters them from the
+ * SCM_RIGHTS payload and the receiver drops any that still arrive.
+ */
+static inline bool fd_type_is_synthetic(int type)
+{
+    return type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
+           type == FD_INOTIFY || type == FD_NETLINK || type == FD_PIDFD ||
+           type == FD_EPOLL;
+}
+
 /* Look up a guest FD and return a dup'd host fd owned by the caller.
  * Thread-safe: dup is performed under fd_lock. Returns -1 on failure.
  * Caller MUST close() the returned fd when done.
diff --git a/src/syscall/io.c b/src/syscall/io.c
index ee183dd..f901ba7 100644
--- a/src/syscall/io.c
+++ b/src/syscall/io.c
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <stdbool.h>
 #include <limits.h>
+#include <pthread.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
@@ -43,6 +44,7 @@
 
 #define SYSCALL_IOV_MAX 1024
 #define SYSCALL_IOV_STACK_MAX 64
+#define URANDOM_CACHE_SIZE 4096
 
 /* Linux terminal struct types. */
 
@@ -60,6 +62,15 @@ typedef struct {
     uint8_t c_cc[19];
 } linux_termios_t;
 
+typedef struct {
+    uint8_t buf[URANDOM_CACHE_SIZE]; /* bytes from the most recent refill */
+    size_t off;                      /* index of the next unread byte */
+    size_t len;                      /* valid bytes in buf; 0 after reset */
+} urandom_cache_t;
+
+static pthread_mutex_t urandom_lock = PTHREAD_MUTEX_INITIALIZER;
+static urandom_cache_t urandom_cache[FD_TABLE_SIZE];
+
 _Static_assert(sizeof(linux_termios_t) == 36,
                "aarch64 Linux TCGETS struct termios must be 36 bytes");
 
@@ -123,6 +134,120 @@ static int64_t io_return_zero(host_fd_ref_t *host_ref)
     return 0;
 }
 
+void urandom_fd_reset_cache(int guest_fd)
+{
+    if (RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) {
+        /* Zeroing leaves off == len, so the next read refills first. */
+        pthread_mutex_lock(&urandom_lock);
+        memset(&urandom_cache[guest_fd], 0, sizeof(urandom_cache_t));
+        pthread_mutex_unlock(&urandom_lock);
+    }
+}
+
+void urandom_fd_cleanup(int guest_fd)
+{
+    /* Only the cached bytes need scrubbing here. The callee performs the
+     * same range check itself, so guest_fd is passed through unchecked.
+     */
+    urandom_fd_reset_cache(guest_fd);
+}
+
+static int64_t urandom_check_readable(int guest_fd)
+{
+    fd_entry_t entry;
+    /* Must still be a live urandom fd, and not opened write-only. */
+    const bool ok = fd_snapshot(guest_fd, &entry) && entry.type == FD_URANDOM;
+    if (!ok || (entry.linux_flags & 3) == LINUX_O_WRONLY)
+        return -LINUX_EBADF;
+    return 0;
+}
+
+static int64_t urandom_fill_iov(int guest_fd,
+                                const struct iovec *iov,
+                                int iovcnt)
+{
+    const int64_t rc = urandom_check_readable(guest_fd);
+    if (rc < 0)
+        return rc;
+
+    size_t want = 0;
+    for (int i = 0; i < iovcnt; i++) {
+        if (iov[i].iov_len > (size_t) SSIZE_MAX - want)
+            return -LINUX_EINVAL;
+        want += iov[i].iov_len;
+    }
+    if (want == 0)
+        return 0;
+
+    /* Serve each buffer in order from this fd's cache, refilling the whole
+     * cache from arc4random_buf() whenever every cached byte is consumed.
+     */
+    size_t filled = 0;
+    pthread_mutex_lock(&urandom_lock);
+    urandom_cache_t *cache = &urandom_cache[guest_fd];
+    for (int i = 0; i < iovcnt && filled < want; i++) {
+        uint8_t *out = iov[i].iov_base;
+        size_t left = iov[i].iov_len;
+        if (left > want - filled)
+            left = want - filled;
+        for (size_t n = 0; left > 0; left -= n, out += n, filled += n) {
+            if (cache->off == cache->len) {
+                arc4random_buf(cache->buf, sizeof(cache->buf));
+                cache->off = 0;
+                cache->len = sizeof(cache->buf);
+            }
+            n = cache->len - cache->off;
+            if (n > left)
+                n = left;
+            memcpy(out, cache->buf + cache->off, n);
+            cache->off += n;
+        }
+    }
+    pthread_mutex_unlock(&urandom_lock);
+    return (int64_t) filled;
+}
+
+static int64_t validate_iov_total(guest_t *g, uint64_t iov_gva, int iovcnt)
+{
+    if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX)
+        return -LINUX_EINVAL;
+    /* Pure validation pass: the running sum only guards the SSIZE_MAX cap. */
+    size_t sum = 0;
+    for (int idx = 0; idx < iovcnt; idx++) {
+        linux_iovec_t ent;
+        const uint64_t ent_gva = iov_gva + (uint64_t) idx * sizeof(ent);
+        if (guest_read_small(g, ent_gva, &ent, sizeof(ent)) < 0)
+            return -LINUX_EFAULT;
+        if (ent.iov_len > (uint64_t) SSIZE_MAX - sum)
+            return -LINUX_EINVAL;
+        sum += (size_t) ent.iov_len;
+    }
+    return 0;
+}
+
+static int64_t urandom_read(guest_t *g,
+                            int guest_fd,
+                            uint64_t buf_gva,
+                            uint64_t count)
+{
+    struct iovec vec = {0};
+    if (count > SSIZE_MAX)
+        count = SSIZE_MAX;
+
+    /* A zero-length read skips the buffer lookup but still goes through
+     * urandom_fill_iov(), which validates the fd before returning 0.
+     */
+    if (count != 0) {
+        uint64_t span = 0;
+        vec.iov_base = guest_ptr_bound(g, buf_gva, &span, MEM_PERM_W, count);
+        if (!vec.iov_base)
+            return -LINUX_EFAULT;
+        vec.iov_len = (size_t) (count < span ? count : span);
+    }
+
+    return urandom_fill_iov(guest_fd, &vec, 1);
+}
+
 static bool rosetta_ioctl_target_fd(guest_t *g, int host_fd)
 {
     if (!g->is_rosetta)
@@ -689,12 +814,11 @@ static int64_t io_write_result(ssize_t ret)
 
 int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        if (fd_table[fd].type == FD_FUSE_DEV)
-            return fuse_dev_write(g, fd, buf_gva, count);
-        if (fd_table[fd].type == FD_EVENTFD)
-            return eventfd_write(fd, g, buf_gva, count);
-    }
+    int type = fd_get_type(fd);
+    if (type == FD_FUSE_DEV)
+        return fuse_dev_write(g, fd, buf_gva, count);
+    if (type == FD_EVENTFD)
+        return eventfd_write(fd, g, buf_gva, count);
 
     host_fd_ref_t host_ref;
     int64_t err = host_fd_ref_open_checked(fd, &host_ref, true);
@@ -741,21 +865,28 @@ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 
 int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        if (fd_table[fd].type == FD_FUSE_DEV)
-            return fuse_dev_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_FUSE_FILE)
-            return fuse_read_fd(g, fd, buf_gva, count);
-        if (fd_table[fd].type == FD_EVENTFD)
-            return eventfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_SIGNALFD)
-            return signalfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_TIMERFD)
-            return timerfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_INOTIFY)
-            return inotify_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_NETLINK)
-            return netlink_read(fd, g, buf_gva, count);
+    /* Read the type once under fd_lock so a concurrent close/reopen cannot
+     * make different dispatch checks disagree. Each handler still
+     * re-validates internally and returns EBADF if its slot changed.
+     */
+    int type = fd_get_type(fd);
+    switch (type) {
+    case FD_FUSE_DEV:
+        return fuse_dev_read(fd, g, buf_gva, count);
+    case FD_FUSE_FILE:
+        return fuse_read_fd(g, fd, buf_gva, count);
+    case FD_EVENTFD:
+        return eventfd_read(fd, g, buf_gva, count);
+    case FD_SIGNALFD:
+        return signalfd_read(fd, g, buf_gva, count);
+    case FD_TIMERFD:
+        return timerfd_read(fd, g, buf_gva, count);
+    case FD_INOTIFY:
+        return inotify_read(fd, g, buf_gva, count);
+    case FD_NETLINK:
+        return netlink_read(fd, g, buf_gva, count);
+    case FD_URANDOM:
+        return urandom_read(g, fd, buf_gva, count);
     }
 
     host_fd_ref_t host_ref;
@@ -914,11 +1045,23 @@ static int64_t build_host_iov(guest_t *g,
                 free(guest_iov);
             return -LINUX_EFAULT;
         }
-        /* Cap to contiguous permitted bytes */
+        /* Cap to contiguous permitted bytes. When the guest iov entry
+         * spans a non-contiguous boundary (different mapping or
+         * permission), zero every subsequent host iov length so the
+         * host readv/writev returns a POSIX-compliant short I/O rather
+         * than silently packing the truncated tail of buffer i into
+         * buffer i+1 -- which corrupts the guest's data layout.
+         */
         uint64_t len = guest_iov[i].iov_len;
-        if (len > avail)
-            len = avail;
         host_iov[i].iov_base = base;
+        if (len > avail) {
+            host_iov[i].iov_len = avail;
+            for (int j = i + 1; j < iovcnt; j++) {
+                host_iov[j].iov_base = NULL;
+                host_iov[j].iov_len = 0;
+            }
+            break;
+        }
         host_iov[i].iov_len = len;
     }
     if (guest_iov != stack_giov)
@@ -981,29 +1124,49 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
         int64_t err = single_guest_iov(g, iov_gva, &giov);
         if (err < 0)
             return err;
+        if (fd_get_type(fd) == FD_URANDOM &&
+            giov.iov_len > (uint64_t) SSIZE_MAX) {
+            err = urandom_check_readable(fd);
+            if (err < 0)
+                return err;
+            return -LINUX_EINVAL;
+        }
         return sys_read(g, fd, giov.iov_base, giov.iov_len);
     }
 
     /* Special FD types need their custom read handlers because glibc may use
      * readv() instead of read() for the same logical operation. Delegate
-     * to the first iov entry's buffer.  Use the first iov's length (not
-     * the sum of all iovs) because the data goes into giov[0].iov_base
-     * which is only giov[0].iov_len bytes long.
+     * scalar special fds to the first iov entry's buffer. Use the first iov's
+     * length (not the sum of all iovs) because the data goes into
+     * giov[0].iov_base which is only giov[0].iov_len bytes long.
      */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        int type = fd_table[fd].type;
-        if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
-            type == FD_INOTIFY) {
-            if (iovcnt <= 0)
-                return -LINUX_EINVAL;
-            /* Use guest_read for the iov array since guest_ptr alone is unsafe
-             * if the array spans a 2MiB block boundary.
-             */
-            linux_iovec_t giov;
-            if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0)
-                return -LINUX_EFAULT;
-            return sys_read(g, fd, giov.iov_base, giov.iov_len);
-        }
+    int type = fd_get_type(fd);
+    if (type == FD_URANDOM) {
+        int64_t err = urandom_check_readable(fd);
+        if (err < 0)
+            return err;
+        err = validate_iov_total(g, iov_gva, iovcnt);
+        if (err < 0)
+            return err;
+        host_iov_buf_t host_iov;
+        err = host_iov_prepare(g, iov_gva, iovcnt, MEM_PERM_W, &host_iov);
+        if (err < 0)
+            return err;
+        int64_t ret = urandom_fill_iov(fd, host_iov.iov, iovcnt);
+        host_iov_free(&host_iov);
+        return ret;
+    }
+    if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
+        type == FD_INOTIFY) {
+        if (iovcnt <= 0)
+            return -LINUX_EINVAL;
+        /* Use guest_read for the iov array since guest_ptr alone is unsafe
+         * if the array spans a 2MiB block boundary.
+         */
+        linux_iovec_t giov;
+        if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0)
+            return -LINUX_EFAULT;
+        return sys_read(g, fd, giov.iov_base, giov.iov_len);
     }
 
     host_fd_ref_t host_ref;
@@ -1051,7 +1214,7 @@ int64_t sys_writev(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
      * sum of all iovs) because the data is at giov.iov_base which is only
      * giov.iov_len bytes.  eventfd expects exactly 8 bytes.
      */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_EVENTFD) {
+    if (fd_get_type(fd) == FD_EVENTFD) {
         if (iovcnt <= 0)
             return -LINUX_EINVAL;
         linux_iovec_t giov;
diff --git a/src/syscall/io.h b/src/syscall/io.h
index 05a3321..399b551 100644
--- a/src/syscall/io.h
+++ b/src/syscall/io.h
@@ -22,6 +22,8 @@
 /* read/write and their positional variants. */
 int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count);
 int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count);
+void urandom_fd_cleanup(int guest_fd);
+void urandom_fd_reset_cache(int guest_fd);
 int64_t sys_pread64(guest_t *g,
                     int fd,
                     uint64_t buf_gva,
diff --git a/src/syscall/net-msg.c b/src/syscall/net-msg.c
index ecc9f71..96221ff 100644
--- a/src/syscall/net-msg.c
+++ b/src/syscall/net-msg.c
@@ -98,7 +98,7 @@ static void recvmsg_close_host_rights(const void *data_src, size_t data_len)
 
 int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_sendmsg(fd, g, msg_gva, linux_flags);
 
     host_fd_ref_t host_ref;
@@ -339,7 +339,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
 
 int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_recvmsg(fd, g, msg_gva, flags);
 
     host_fd_ref_t host_ref;
diff --git a/src/syscall/net.c b/src/syscall/net.c
index b80ca18..05b0c76 100644
--- a/src/syscall/net.c
+++ b/src/syscall/net.c
@@ -215,7 +215,7 @@ int64_t sys_socketpair(guest_t *g,
 int64_t sys_bind(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen)
 {
     /* Netlink sockets use synthetic fd; dispatch to netlink handler */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_bind(fd, g, addr_gva, addrlen);
 
     host_fd_ref_t host_ref;
@@ -469,7 +469,7 @@ int64_t sys_connect(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen)
             return linux_errno();
         }
 
-        if (fd_alloc_at(fd, FD_SOCKET, pair[0]) < 0) {
+        if (fd_alloc_at(fd, FD_SOCKET, pair[0], NULL) < 0) {
             close(pair[0]);
             close(pair[1]);
             host_fd_ref_close(&host_ref);
diff --git a/src/syscall/netlink.c b/src/syscall/netlink.c
index a1b555e..32c3ec3 100644
--- a/src/syscall/netlink.c
+++ b/src/syscall/netlink.c
@@ -396,6 +396,7 @@ static int nl_build_getaddr(netlink_state_t *ns)
 void netlink_init(void)
 {
     memset(nl_state, 0, sizeof(nl_state));
+    fd_register_cleanup(FD_NETLINK, netlink_close);
 }
 
 int64_t netlink_socket(int protocol, int type)
diff --git a/src/syscall/proc-pidfd.c b/src/syscall/proc-pidfd.c
index 62480f3..635eb88 100644
--- a/src/syscall/proc-pidfd.c
+++ b/src/syscall/proc-pidfd.c
@@ -50,6 +50,13 @@ static pidfd_entry_t *pidfd_find_guest_fd_entry(int guest_fd)
     return NULL;
 }
 
+static void pidfd_cleanup(int guest_fd);
+
+void pidfd_init(void)
+{
+    fd_register_cleanup(FD_PIDFD, pidfd_cleanup);
+}
+
 static void pidfd_cleanup(int guest_fd)
 {
     pthread_mutex_lock(&pidfd_lock);
diff --git a/src/syscall/proc-pidfd.h b/src/syscall/proc-pidfd.h
index 8d02df4..79e55e5 100644
--- a/src/syscall/proc-pidfd.h
+++ b/src/syscall/proc-pidfd.h
@@ -10,6 +10,7 @@
 
 #include "core/guest.h"
 
+void pidfd_init(void);
 int pidfd_create(guest_t *g, int64_t target_pid);
 void proc_pidfd_notify_exit(int64_t exited_pid);
 int64_t proc_pidfd_lookup_pid(int guest_fd);
diff --git a/src/syscall/signal.c b/src/syscall/signal.c
index 2156638..2ac58be 100644
--- a/src/syscall/signal.c
+++ b/src/syscall/signal.c
@@ -1447,7 +1447,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
      * glibc leaves sa_restorer uninitialized (garbage); musl sets it to
      * __restore_rt.  Match the kernel: always use the vDSO trampoline.
      */
-    hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_TEXT);
+    hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_SIGRET);
 
     if (act->sa_flags & LINUX_SA_SIGINFO) {
         /* X1 = pointer to siginfo, X2 = pointer to ucontext */
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 68cad6d..81a51f7 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -56,6 +56,7 @@
 #include "syscall/poll.h"
 #include "syscall/path.h"
 #include "syscall/proc.h"
+#include "syscall/proc-pidfd.h"
 #include "syscall/signal.h"
 #include "syscall/sys.h"
 #include "syscall/sysvipc.h"
@@ -95,6 +96,8 @@ void syscall_init(void)
     inotify_init();
     netlink_init();
     fuse_init();
+    pidfd_init();
+    fd_register_cleanup(FD_URANDOM, urandom_fd_cleanup);
     wakeup_pipe_init();
 }
 
diff --git a/src/syscall/time.c b/src/syscall/time.c
index 8a76c4b..d29932b 100644
--- a/src/syscall/time.c
+++ b/src/syscall/time.c
@@ -15,6 +15,7 @@
 
 #include "utils.h"
 
+#include "core/vdso.h"
 #include "runtime/thread.h" /* current_thread, guest_tid */
 #include "syscall/abi.h"
 #include "syscall/internal.h"
@@ -253,6 +254,27 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
     if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0)
         return -LINUX_EFAULT;
 
+    /* If this trap came from the __kernel_clock_gettime vDSO svc_fallback,
+     * the trampoline parked the guest's CNTVCT_EL0 read in X9 before
+     * issuing SVC, and ELR_EL1 holds the address immediately after that
+     * SVC. Pair X9 with the wall_clock we just computed and seed the vvar
+     * so subsequent calls hit the fast path. Skip the seed for any other
+     * trap (raw syscall(SYS_clock_gettime, ...) from guest code, etc.):
+     * X9 is then arbitrary guest state, and seeding from it would poison
+     * the anchor and break every later fast-path call.
+     */
+    if (clockid == 1 /* CLOCK_MONOTONIC */ && current_thread) {
+        uint64_t elr = 0;
+        uint64_t guest_cntvct = 0;
+        if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
+                                &elr) == HV_SUCCESS &&
+            elr == vdso_clock_gettime_svc_pc() + 4 &&
+            hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) ==
+                HV_SUCCESS &&
+            guest_cntvct != 0)
+            vdso_seed_anchor(g, guest_cntvct, ts.tv_sec, ts.tv_nsec);
+    }
+
     return 0;
 }
 
diff --git a/tests/manifest.txt b/tests/manifest.txt
index ff9631b..e1f6c29 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -45,10 +45,12 @@ test-file-ops
 test-sysinfo
 test-io-opt
 test-syscall-smoke
+test-vdso
 test-poll                      # diff=skip
 
 [section] I/O subsystem tests
 test-eventfd
+test-eventfd-dup
 test-signalfd
 test-signalfd-hardening
 test-epoll
@@ -83,8 +85,9 @@ test-clone3                    # diff=skip
 test-fork-exec $TESTDIR/echo-test
 test-fork-lowbase
 
-[section] COW fork isolation tests
+[section] CoW fork isolation tests
 test-cow-fork
+test-fork-synthetic-fd
 
 [section] O_CLOEXEC tests
 test-cloexec
diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c
index 8770420..f7cc0c7 100644
--- a/tests/test-cow-fork.c
+++ b/tests/test-cow-fork.c
@@ -1,4 +1,4 @@
-/* COW fork memory isolation tests
+/* CoW fork memory isolation tests
  *
  * Copyright 2026 elfuse contributors
  * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
@@ -166,11 +166,11 @@ static void test_mmap_isolation(void)
     munmap(region, 4096);
 }
 
-/* Test 4: Large region COW (verify no corruption) */
+/* Test 4: Large region CoW (verify no corruption) */
 
 static void test_large_cow(void)
 {
-    TEST("fork: 1MiB COW integrity");
+    TEST("fork: 1MiB CoW integrity");
 
     int pipefd[2];
     if (pipe(pipefd) != 0) {
@@ -229,7 +229,7 @@ static void test_large_cow(void)
     int status;
     waitpid(pid, &status, 0);
 
-    EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed");
+    EXPECT_TRUE(parent_ok && child_ok, "1MiB CoW integrity failed");
     munmap(buf, sz);
 }
 
@@ -302,7 +302,7 @@ static void test_brk_isolation(void)
 
 int main(void)
 {
-    printf("test-cow-fork: COW fork memory isolation tests\n");
+    printf("test-cow-fork: CoW fork memory isolation tests\n");
 
     test_stack_isolation();
     test_heap_isolation();
diff --git a/tests/test-eventfd-dup.c b/tests/test-eventfd-dup.c
new file mode 100644
index 0000000..484c2d7
--- /dev/null
+++ b/tests/test-eventfd-dup.c
@@ -0,0 +1,65 @@
+/* test-eventfd-dup.c -- dup of eventfd shares state (Linux contract)
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Linux dup of an eventfd produces a second descriptor that points at the
+ * same kernel object; reads and writes on either fd see the same counter.
+ * elfuse used to give each dup'd guest_fd a fresh side-table slot, so
+ * dup'd eventfds diverged, breaking programs that signal across the
+ * pair. This test pins the contract by:
+ *   - duping an eventfd initialised with counter=7, reading via the dup,
+ *     verifying the dup observes the source's initial value
+ *   - writing via the source, reading via the dup, verifying state shares
+ *   - closing one end of the alias and continuing to operate on the other
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+static int failures = 0;
+
+#define EXPECT(cond, msg)                       \
+    do {                                        \
+        if (!(cond)) {                          \
+            fprintf(stderr, "FAIL: %s\n", msg); \
+            failures++;                         \
+        }                                       \
+    } while (0)
+
+int main(void)
+{
+    int a = eventfd(7, EFD_CLOEXEC);
+    EXPECT(a >= 0, "eventfd(7) returned valid fd");
+    int b = dup(a);
+    EXPECT(b >= 0, "dup(a) returned valid fd");
+
+    uint64_t v = 0;
+    EXPECT(read(b, &v, 8) == 8, "read 8 bytes from dup'd fd");
+    EXPECT(v == 7, "dup'd fd observes source initial counter (7)");
+
+    uint64_t n = 42;
+    EXPECT(write(a, &n, 8) == 8, "write 42 to source fd");
+    EXPECT(read(b, &v, 8) == 8, "read counter from dup'd fd");
+    EXPECT(v == 42, "dup'd fd observes source write (42)");
+
+    close(a);
+    n = 99;
+    EXPECT(write(b, &n, 8) == 8, "write 99 to alias after closing source");
+    EXPECT(read(b, &v, 8) == 8, "read after partial close");
+    EXPECT(v == 99, "alias still functional after partial close");
+    struct pollfd pfd = {.fd = b, .events = POLLIN};
+    EXPECT(poll(&pfd, 1, 0) == 0, "alias is not readable after drain");
+    close(b);
+
+    if (failures) {
+        printf("test-eventfd-dup: %d FAIL\n", failures);
+        return 1;
+    }
+    puts("test-eventfd-dup: PASS");
+    return 0;
+}
diff --git a/tests/test-fork-synthetic-fd.c b/tests/test-fork-synthetic-fd.c
new file mode 100644
index 0000000..1e89a46
--- /dev/null
+++ b/tests/test-fork-synthetic-fd.c
@@ -0,0 +1,218 @@
+/* test-fork-synthetic-fd.c -- fork inheritance contract for synthetic fds
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The fork-IPC handoff does NOT serialize per-class side tables for
+ * eventfd/signalfd/timerfd/inotify/netlink/pidfd. Restoring the
+ * inherited host fd without that state leaves a half-functional slot,
+ * so fork-state.c explicitly drops these in the child. This test pins
+ * that contract:
+ *   - urandom IS inherited (no per-class state to lose; cache is fresh
+ *     in the child and arc4random_buf works)
+ *   - eventfd / signalfd / timerfd / inotify are NOT inherited; the
+ *     child sees EBADF and can recreate the fd at the same slot
+ *   - the inherited host fd does not leak in the child
+ *
+ * Once a subsystem grows a serialize/restore path, the corresponding
+ * EBADF expectation here flips to a positive inheritance check.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/inotify.h>
+#include <sys/signalfd.h>
+#include <sys/timerfd.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static int failures = 0;
+
+#define EXPECT(cond, msg)                       \
+    do {                                        \
+        if (!(cond)) {                          \
+            fprintf(stderr, "FAIL: %s\n", msg); \
+            failures++;                         \
+        }                                       \
+    } while (0)
+
+static int run_child(int (*fn)(int), int fd)
+{
+    pid_t pid = fork();
+    if (pid < 0)
+        return -1;
+    if (pid == 0)
+        _exit(fn(fd));
+    int status = 0;
+    if (waitpid(pid, &status, 0) < 0)
+        return -1;
+    return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+static int child_urandom_read(int fd)
+{
+    unsigned char b[8];
+    if (read(fd, b, sizeof(b)) != (ssize_t) sizeof(b))
+        return 1;
+    int seen_nonzero = 0;
+    for (size_t i = 0; i < sizeof(b); i++)
+        if (b[i] != 0)
+            seen_nonzero = 1;
+    return seen_nonzero ? 0 : 2;
+}
+
+static int child_ebadf_read(int fd)
+{
+    char buf[8] = {0};
+    errno = 0;
+    ssize_t n = read(fd, buf, sizeof(buf));
+    if (n != -1)
+        return 1;
+    if (errno != EBADF)
+        return 2;
+    return 0;
+}
+
+static int child_ebadf_reusable_at_same_fd(int fd)
+{
+    int rc = child_ebadf_read(fd);
+    if (rc != 0)
+        return rc;
+    int again = open("/dev/null", O_RDONLY | O_CLOEXEC);
+    if (again < 0)
+        return 3;
+    if (again != fd) {
+        close(again);
+        return 4;
+    }
+    close(again);
+    return 0;
+}
+
+static int child_eventfd_recreate(int fd)
+{
+    /* The inherited eventfd slot should be FD_CLOSED in the child; we
+     * should be able to create a fresh eventfd that works normally.
+     */
+    char buf[8];
+    errno = 0;
+    if (read(fd, buf, sizeof(buf)) != -1 || errno != EBADF)
+        return 1;
+    close(fd); /* harmless on a closed slot */
+    int e = eventfd(0, EFD_CLOEXEC);
+    if (e < 0)
+        return 2;
+    uint64_t one = 1;
+    if (write(e, &one, sizeof(one)) != (ssize_t) sizeof(one)) {
+        close(e);
+        return 3;
+    }
+    uint64_t got = 0;
+    if (read(e, &got, sizeof(got)) != (ssize_t) sizeof(got) || got != 1) {
+        close(e);
+        return 4;
+    }
+    close(e);
+    return 0;
+}
+
+static void test_urandom_inherited(void)
+{
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    EXPECT(fd >= 0, "open /dev/urandom");
+    if (fd < 0)
+        return;
+    int rc = run_child(child_urandom_read, fd);
+    EXPECT(rc == 0, "child can read inherited /dev/urandom");
+    close(fd);
+}
+
+static void test_synthetic_dropped(const char *label, int (*opener)(void))
+{
+    int fd = opener();
+    EXPECT(fd >= 0, label);
+    if (fd < 0)
+        return;
+    int rc = run_child(child_ebadf_read, fd);
+    char msg[80];
+    snprintf(msg, sizeof(msg), "child sees EBADF on inherited %s", label);
+    EXPECT(rc == 0, msg);
+    close(fd);
+}
+
+static void test_eventfd_recreate(void)
+{
+    int fd = eventfd(0, EFD_CLOEXEC);
+    EXPECT(fd >= 0, "open eventfd");
+    if (fd < 0)
+        return;
+    int rc = run_child(child_eventfd_recreate, fd);
+    EXPECT(rc == 0, "child can recreate eventfd after drop");
+    close(fd);
+}
+
+static void test_low_synthetic_dropped(void)
+{
+    int saved_stdin = dup(STDIN_FILENO);
+    EXPECT(saved_stdin >= 0, "save stdin");
+    if (saved_stdin < 0)
+        return;
+
+    EXPECT(close(STDIN_FILENO) == 0, "close stdin");
+    int fd = eventfd(0, EFD_CLOEXEC);
+    EXPECT(fd == STDIN_FILENO, "eventfd reuses fd 0");
+    if (fd == STDIN_FILENO) {
+        int rc = run_child(child_ebadf_reusable_at_same_fd, fd);
+        EXPECT(rc == 0, "child sees EBADF on low inherited eventfd");
+        close(fd);
+    } else if (fd >= 0) {
+        close(fd);
+    }
+
+    EXPECT(dup2(saved_stdin, STDIN_FILENO) == STDIN_FILENO, "restore stdin");
+    close(saved_stdin);
+}
+
+static int open_eventfd(void)
+{
+    return eventfd(0, EFD_CLOEXEC);
+}
+static int open_timerfd(void)
+{
+    return timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+}
+static int open_signalfd(void)
+{
+    sigset_t s;
+    sigemptyset(&s);
+    sigaddset(&s, SIGUSR1);
+    return signalfd(-1, &s, SFD_CLOEXEC);
+}
+static int open_inotify(void)
+{
+    return inotify_init1(IN_CLOEXEC);
+}
+
+int main(void)
+{
+    printf("test-fork-synthetic-fd: synthetic fd fork inheritance contract\n");
+    test_urandom_inherited();
+    test_synthetic_dropped("eventfd", open_eventfd);
+    test_synthetic_dropped("timerfd", open_timerfd);
+    test_synthetic_dropped("signalfd", open_signalfd);
+    test_synthetic_dropped("inotify", open_inotify);
+    test_eventfd_recreate();
+    test_low_synthetic_dropped();
+    if (failures) {
+        printf("test-fork-synthetic-fd: %d FAIL\n", failures);
+        return 1;
+    }
+    puts("test-fork-synthetic-fd: PASS");
+    return 0;
+}
diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c
index 28b76e7..891dd7b 100644
--- a/tests/test-large-io-boundary.c
+++ b/tests/test-large-io-boundary.c
@@ -182,12 +182,55 @@ static void test_large_read_from_split_block(void)
     EXPECT_TRUE(ok, "read returned short count or corrupted data");
 }
 
+static void test_urandom_read_crosses_boundary(void)
+{
+    TEST("/dev/urandom partial read at mapping boundary");
+
+    size_t page = (size_t) sysconf(_SC_PAGESIZE);
+    unsigned char *map = mmap(NULL, page * 2, PROT_READ | PROT_WRITE,
+                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (map == MAP_FAILED) {
+        FAIL("mmap failed");
+        return;
+    }
+    if (munmap(map + page, page) != 0) {
+        munmap(map, page);
+        FAIL("munmap guard failed");
+        return;
+    }
+
+    memset(map, 0, page);
+
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        munmap(map, page);
+        FAIL("open failed");
+        return;
+    }
+
+    ssize_t ret = read(fd, map, page * 2);
+    close(fd);
+
+    bool any_nonzero = false;
+    for (size_t i = 0; i < page; i++) {
+        if (map[i] != 0) {
+            any_nonzero = true;
+            break;
+        }
+    }
+
+    munmap(map, page);
+    EXPECT_TRUE(ret == (ssize_t) page && any_nonzero,
+                "urandom read did not preserve partial boundary result");
+}
+
 int main(void)
 {
     printf("large I/O boundary tests\n\n");
 
     test_large_write();
     test_large_read_from_split_block();
+    test_urandom_read_crosses_boundary();
 
     SUMMARY("test-large-io-boundary");
     return fails ? 1 : 0;
diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh
index e6a6140..ad6921b 100755
--- a/tests/test-matrix.sh
+++ b/tests/test-matrix.sh
@@ -494,7 +494,7 @@ run_unit_tests()
     printf "\nNegative tests\n"
     test_check "$runner" "test-negative" "0 failed" "$bindir/test-negative"
 
-    printf "\nCOW fork isolation\n"
+    printf "\nCoW fork isolation\n"
     test_check "$runner" "test-cow-fork" "PASS" "$bindir/test-cow-fork"
 
     printf "\nGuard page / mmap edge cases\n"
diff --git a/tests/test-syscall-smoke.c b/tests/test-syscall-smoke.c
index 809998f..8419467 100644
--- a/tests/test-syscall-smoke.c
+++ b/tests/test-syscall-smoke.c
@@ -6,6 +6,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <netinet/in.h>
 #include <poll.h>
 #include <signal.h>
@@ -57,6 +58,10 @@
 #define SYS_sigaltstack 132
 #endif
 
+#ifndef O_PATH
+#define O_PATH 010000000
+#endif
+
 #ifndef SYS_set_tid_address
 #define SYS_set_tid_address 96
 #endif
@@ -623,6 +628,242 @@ static void test_sysv_semaphore_ops(void)
     }
 }
 
+static void test_urandom_byte_reads(void)
+{
+    TEST("/dev/urandom byte reads");
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        FAIL("open");
+        return;
+    }
+
+    unsigned char bytes[32];
+    for (size_t i = 0; i < sizeof(bytes); i++) {
+        ssize_t n = read(fd, &bytes[i], 1);
+        if (n != 1) {
+            close(fd);
+            FAIL("read");
+            return;
+        }
+    }
+    close(fd);
+
+    bool all_same = true;
+    for (size_t i = 1; i < sizeof(bytes); i++) {
+        if (bytes[i] != bytes[0]) {
+            all_same = false;
+            break;
+        }
+    }
+    if (all_same) {
+        FAIL("entropy stream did not vary");
+        return;
+    }
+    PASS();
+}
+
+static void test_urandom_open_flags(void)
+{
+    TEST("/dev/urandom open flags");
+
+    errno = 0;
+    int dirfd = open("/dev/urandom", O_RDONLY | O_DIRECTORY);
+    if (dirfd >= 0) {
+        close(dirfd);
+        FAIL("O_DIRECTORY open succeeded");
+        return;
+    }
+    if (errno != ENOTDIR) {
+        FAIL("O_DIRECTORY errno");
+        return;
+    }
+
+    int pathfd = open("/dev/urandom", O_PATH | O_CLOEXEC);
+    if (pathfd < 0) {
+        FAIL("O_PATH open");
+        return;
+    }
+    unsigned char b = 0;
+    errno = 0;
+    ssize_t n = read(pathfd, &b, 1);
+    int saved_errno = errno;
+    close(pathfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_PATH read");
+        return;
+    }
+
+    int wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open");
+        return;
+    }
+    int fl = fcntl(wfd, F_GETFL);
+    errno = 0;
+    n = read(wfd, &b, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (fl < 0 || (fl & O_ACCMODE) != O_WRONLY) {
+        FAIL("O_WRONLY F_GETFL");
+        return;
+    }
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY read");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open readv");
+        return;
+    }
+    struct iovec wv[2] = {{&b, 1}, {&b, 1}};
+    errno = 0;
+    n = readv(wfd, wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY readv");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized readv");
+        return;
+    }
+    struct iovec huge_wv[2] = {{&b, SSIZE_MAX}, {&b, 1}};
+    errno = 0;
+    n = readv(wfd, huge_wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized readv");
+        return;
+    }
+
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized single readv");
+        return;
+    }
+    struct iovec huge_one_wv = {&b, (size_t) SSIZE_MAX + 1};
+    errno = 0;
+    n = readv(wfd, &huge_one_wv, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized single readv");
+        return;
+    }
+
+    int rfd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (rfd < 0) {
+        FAIL("O_RDONLY open readv");
+        return;
+    }
+    unsigned char rb[2] = {0};
+    struct iovec rv[2] = {{&rb[0], 1}, {&rb[1], 1}};
+    n = readv(rfd, rv, 2);
+    if (n != 2) {
+        close(rfd);
+        FAIL("O_RDONLY readv");
+        return;
+    }
+
+    struct iovec huge[2] = {{&b, SSIZE_MAX}, {&b, 1}};
+    errno = 0;
+    n = readv(rfd, huge, 2);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized readv");
+        return;
+    }
+
+    struct iovec huge_one = {&b, (size_t) SSIZE_MAX + 1};
+    errno = 0;
+    n = readv(rfd, &huge_one, 1);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized single readv");
+        return;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        close(rfd);
+        FAIL("fork inherited urandom");
+        return;
+    }
+    if (pid == 0) {
+        unsigned char child_b = 0;
+        _exit(read(rfd, &child_b, 1) == 1 ? 0 : 1);
+    }
+    int status = 0;
+    waitpid(pid, &status, 0);
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        close(rfd);
+        FAIL("inherited urandom read");
+        return;
+    }
+
+    int p[2];
+    if (pipe(p) != 0) {
+        close(rfd);
+        FAIL("urandom fork pipe");
+        return;
+    }
+    unsigned char seed = 0;
+    if (read(rfd, &seed, 1) != 1) {
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("prime urandom cache before fork");
+        return;
+    }
+    pid = fork();
+    if (pid < 0) {
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("fork urandom cache isolation");
+        return;
+    }
+    if (pid == 0) {
+        close(p[0]);
+        unsigned char child_buf[64];
+        ssize_t got = read(rfd, child_buf, sizeof(child_buf));
+        ssize_t put = got == (ssize_t) sizeof(child_buf)
+                          ? write(p[1], child_buf, sizeof(child_buf))
+                          : -1;
+        close(p[1]);
+        _exit(put == (ssize_t) sizeof(child_buf) ? 0 : 1);
+    }
+    close(p[1]);
+    unsigned char parent_buf[64];
+    unsigned char child_buf[64];
+    ssize_t parent_n = read(rfd, parent_buf, sizeof(parent_buf));
+    ssize_t child_n = read(p[0], child_buf, sizeof(child_buf));
+    close(p[0]);
+    status = 0;
+    waitpid(pid, &status, 0);
+    close(rfd);
+    if (parent_n != (ssize_t) sizeof(parent_buf) ||
+        child_n != (ssize_t) sizeof(child_buf) || !WIFEXITED(status) ||
+        WEXITSTATUS(status) != 0) {
+        FAIL("urandom fork cache isolation read");
+        return;
+    }
+    if (memcmp(parent_buf, child_buf, sizeof(parent_buf)) == 0) {
+        FAIL("urandom fork duplicated cached bytes");
+        return;
+    }
+
+    PASS();
+}
+
 int main(int argc, char **argv)
 {
     printf("test-syscall-smoke: direct syscall smoke coverage\n\n");
@@ -642,6 +883,8 @@ int main(int argc, char **argv)
     test_memory_stubs();
     test_accept4();
     test_sysv_semaphore_ops();
+    test_urandom_byte_reads();
+    test_urandom_open_flags();
 
     SUMMARY("test-syscall-smoke");
     return fails > 0 ? 1 : 0;
diff --git a/tests/test-vdso.c b/tests/test-vdso.c
new file mode 100644
index 0000000..83aab76
--- /dev/null
+++ b/tests/test-vdso.c
@@ -0,0 +1,242 @@
+/* test-vdso.c -- vDSO ELF correctness and symbol-resolution probe
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Confirms the synthetic vDSO emitted by src/core/vdso.c:
+ *   1. is published via AT_SYSINFO_EHDR
+ *   2. parses as a valid ELF shared object
+ *   3. exports the four __kernel_* symbols at addresses inside the page
+ *   4. carries GNU symbol versioning naming LINUX_2.6.39 so glibc/musl
+ *      dl_vdso_vsym() can resolve unversioned lookups
+ *   5. trampolines actually execute (call __kernel_clock_gettime and
+ *      compare the result against a direct SVC clock_gettime)
+ *
+ * Static binary so the standard test driver runs it under elfuse with
+ * no sysroot. The probe walks the vDSO's dynamic linker structure
+ * itself rather than relying on dlsym (which is unavailable in static
+ * builds anyway), so a regression in the ELF layout fails this test
+ * regardless of which libc would later consume it.
+ */
+
+#include <elf.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+static int failures = 0;
+/* FAIL(msg) logs to stderr and bumps `failures`; EXPECT FAILs if !cond. */
+#define FAIL(msg)                           \
+    do {                                    \
+        fprintf(stderr, "FAIL: %s\n", msg); \
+        failures++;                         \
+    } while (0)
+
+#define EXPECT(cond, msg) \
+    do {                  \
+        if (!(cond))      \
+            FAIL(msg);    \
+    } while (0)
+
+/* Classic SysV ELF symbol hash; kept in step with src/core/vdso.c. */
+static uint32_t elf_hash(const char *name)
+{
+    uint32_t acc = 0;
+    for (const unsigned char *p = (const unsigned char *) name; *p; p++) {
+        acc = (acc << 4) + *p;
+        uint32_t top = acc & 0xf0000000U;
+        if (top)
+            acc ^= top >> 24;
+        acc &= ~top;
+    }
+    return acc;
+}
+
+/* Resolve `name` via the SysV DT_HASH table; NULL if absent or table empty. */
+static const Elf64_Sym *lookup_sym(const Elf64_Ehdr *ehdr,
+                                   const Elf64_Sym *symtab, const char *strtab,
+                                   const uint32_t *hash, const char *name)
+{
+    /* DT_HASH layout: nbucket, nchain, bucket[nbucket], chain[nchain]. */
+    uint32_t nbucket = hash[0], nchain = hash[1];
+    const uint32_t *bucket = &hash[2], *chain = &bucket[nbucket];
+    (void) ehdr; /* not needed for the walk itself */
+    if (nbucket == 0) /* empty table: elf_hash(name) % 0 would be UB */
+        return NULL;
+    uint32_t h = elf_hash(name) % nbucket;
+    for (uint32_t i = bucket[h]; i && i < nchain; i = chain[i]) {
+        if (strcmp(&strtab[symtab[i].st_name], name) == 0)
+            return &symtab[i];
+    }
+    return NULL;
+}
+
+typedef struct { /* tables parse_vdso() collects from PT_DYNAMIC */
+    const Elf64_Sym *symtab;    /* from DT_SYMTAB */
+    const char *strtab;         /* from DT_STRTAB */
+    const uint32_t *hash;       /* from DT_HASH (SysV hash table) */
+    const uint16_t *versym;     /* from DT_VERSYM; zeroed if tag absent */
+    const Elf64_Verdef *verdef; /* from DT_VERDEF; zeroed if tag absent */
+    size_t strsz;               /* DT_STRSZ value */
+    int verdef_count;           /* DT_VERDEFNUM value */
+} vdso_t;
+
+/* Find PT_DYNAMIC in the image at `ehdr` and record its tables in `v`.
+ *
+ * Each d_ptr is taken as an offset from `ehdr` rather than as an absolute
+ * address.
+ *
+ * Returns 0 when DT_SYMTAB, DT_STRTAB and DT_HASH were all found, -1 when
+ * there is no PT_DYNAMIC segment or any of those three is missing.
+ */
+static int parse_vdso(const Elf64_Ehdr *ehdr, vdso_t *v)
+{
+    memset(v, 0, sizeof(*v));
+
+    const uint8_t *image = (const uint8_t *) ehdr;
+    const Elf64_Phdr *ph = (const Elf64_Phdr *) (image + ehdr->e_phoff);
+    const Elf64_Dyn *dynamic = NULL;
+    for (int idx = 0; idx < ehdr->e_phnum; idx++) {
+        if (ph[idx].p_type != PT_DYNAMIC)
+            continue;
+        dynamic = (const Elf64_Dyn *) (image + ph[idx].p_offset);
+        break;
+    }
+    if (dynamic == NULL)
+        return -1;
+
+    for (const Elf64_Dyn *d = dynamic; d->d_tag != DT_NULL; d++) {
+        const uint8_t *target = image + d->d_un.d_ptr;
+        if (d->d_tag == DT_SYMTAB)
+            v->symtab = (const Elf64_Sym *) target;
+        else if (d->d_tag == DT_STRTAB)
+            v->strtab = (const char *) target;
+        else if (d->d_tag == DT_STRSZ)
+            v->strsz = (size_t) d->d_un.d_val;
+        else if (d->d_tag == DT_HASH)
+            v->hash = (const uint32_t *) target;
+        else if (d->d_tag == DT_VERSYM)
+            v->versym = (const uint16_t *) target;
+        else if (d->d_tag == DT_VERDEF)
+            v->verdef = (const Elf64_Verdef *) target;
+        else if (d->d_tag == DT_VERDEFNUM)
+            v->verdef_count = (int) d->d_un.d_val;
+    }
+    if (v->symtab == NULL || v->strtab == NULL || v->hash == NULL)
+        return -1;
+    return 0;
+}
+
+/* Version name for versym entry `ndx` (hidden bit ignored), or NULL. */
+static const char *verdef_name_for_ndx(const vdso_t *v, uint16_t ndx)
+{
+    const Elf64_Verdef *vd = v->verdef;
+    for (int i = 0; i < v->verdef_count && vd; i++) {
+        if ((vd->vd_ndx & 0x7fff) == (ndx & 0x7fff)) { /* drop hidden bit */
+            const Elf64_Verdaux *aux =
+                (const Elf64_Verdaux *) ((const uint8_t *) vd + vd->vd_aux);
+            return &v->strtab[aux->vda_name];
+        }
+        const uint8_t *next = (const uint8_t *) vd + vd->vd_next;
+        vd = vd->vd_next ? (const Elf64_Verdef *) next : NULL; /* 0 = end */
+    }
+    return NULL;
+}
+
+typedef int (*clock_gettime_fn)(clockid_t, struct timespec *);
+
+/* Validate the published vDSO image end to end, recording any failure. */
+static void test_vdso(void)
+{
+    unsigned long base = getauxval(AT_SYSINFO_EHDR);
+    EXPECT(base != 0, "AT_SYSINFO_EHDR is zero");
+    if (!base)
+        return;
+    printf("AT_SYSINFO_EHDR = 0x%lx\n", base);
+
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *) base;
+    if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) {
+        FAIL("vDSO ELF magic");
+        return; /* not ELF: every header field read below would be garbage */
+    }
+    EXPECT(ehdr->e_machine == EM_AARCH64, "vDSO e_machine");
+    EXPECT(ehdr->e_type == ET_DYN, "vDSO e_type");
+
+    vdso_t v;
+    EXPECT(parse_vdso(ehdr, &v) == 0, "vDSO dynamic section parse");
+    if (!v.symtab || !v.strtab || !v.hash)
+        return;
+
+    /* All four __kernel_* symbols must resolve and land in the vDSO page. */
+    static const char *names[] = {
+        "__kernel_rt_sigreturn", "__kernel_clock_getres",
+        "__kernel_clock_gettime", "__kernel_gettimeofday"};
+    const Elf64_Sym *syms[4] = {0};
+    for (int i = 0; i < 4; i++) {
+        syms[i] = lookup_sym(ehdr, v.symtab, v.strtab, v.hash, names[i]);
+        char buf[64];
+        snprintf(buf, sizeof(buf), "lookup %s", names[i]);
+        EXPECT(syms[i] != NULL, buf);
+        if (!syms[i])
+            continue;
+        uint64_t addr = base + syms[i]->st_value;
+        snprintf(buf, sizeof(buf), "%s address in vDSO page", names[i]);
+        EXPECT(addr >= base && addr < base + 0x1000, buf);
+    }
+
+    /* Symbol versioning: every defined symbol must point at LINUX_2.6.39. */
+    EXPECT(v.versym != NULL, "vDSO DT_VERSYM present");
+    EXPECT(v.verdef != NULL, "vDSO DT_VERDEF present");
+    if (v.versym && v.verdef) {
+        for (int i = 0; i < 4; i++) {
+            if (!syms[i])
+                continue;
+            uint32_t sym_idx = (uint32_t) (syms[i] - v.symtab);
+            uint16_t ndx = v.versym[sym_idx];
+            const char *ver = verdef_name_for_ndx(&v, ndx);
+            char buf[80];
+            snprintf(buf, sizeof(buf), "%s versioned LINUX_2.6.39", names[i]);
+            EXPECT(ver && strcmp(ver, "LINUX_2.6.39") == 0, buf);
+        }
+    }
+
+    /* Direct calls into the vDSO trampoline; each must agree with SVC. Two
+     * rounds, since the first call may take the SVC fallback and only a
+     * repeat call can reach the CNTVCT path (NOTE(review): confirm). */
+    const Elf64_Sym *cg = syms[2]; /* __kernel_clock_gettime, found above */
+    for (int round = 0; cg && round < 2; round++) {
+        clock_gettime_fn fn =
+            (clock_gettime_fn) (uintptr_t) (base + cg->st_value);
+        struct timespec via_vdso = {0}, via_svc = {0};
+        int r1 = fn(CLOCK_MONOTONIC, &via_vdso);
+        int r2 = (int) syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &via_svc);
+        EXPECT(r1 == 0, "vDSO clock_gettime returned 0");
+        EXPECT(r2 == 0, "SVC clock_gettime returned 0");
+        /* Both should produce a sane monotonic value within ~10ms of each
+         * other (allowing for the gap between the two calls). */
+        int64_t delta_ns =
+            ((int64_t) via_svc.tv_sec - via_vdso.tv_sec) * 1000000000LL +
+            (via_svc.tv_nsec - via_vdso.tv_nsec);
+        if (delta_ns < 0)
+            delta_ns = -delta_ns;
+        EXPECT(delta_ns < 10000000, "vDSO and SVC clock_gettime agree");
+        printf("vDSO/SVC clock_gettime delta = %" PRId64 " ns\n", delta_ns);
+    }
+}
+
+/* Entry point: run the probe and map the failure count to an exit status. */
+int main(void)
+{
+    printf("test-vdso: vDSO ELF + symbol-versioning probe\n");
+    test_vdso();
+    if (!failures)
+        puts("test-vdso: PASS");
+    else
+        printf("test-vdso: %d FAIL\n", failures);
+    return failures ? 1 : 0;
+}