diff --git a/Makefile b/Makefile index 405651b..7f4814f 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ SRCS := \ core/stack.c \ core/vdso.c \ core/bootstrap.c \ + core/rosetta.c \ core/sysroot.c \ runtime/thread.c \ runtime/futex.c \ diff --git a/mk/tests.mk b/mk/tests.mk index 155025f..f04a6f5 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -4,6 +4,8 @@ test-static-bins \ test-dynamic test-dynamic-coreutils test-glibc-dynamic \ test-glibc-coreutils test-perf \ + test-rosetta-cli test-rosetta-statics test-rosetta-failure-modes \ + test-rosetta-alpine test-rosetta-all bench-rosetta test-matrix-elfuse-x86_64 \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ test-case-collision test-case-collision-fallback test-sysroot-create-paths \ @@ -20,6 +22,17 @@ test-hello: $(ELFUSE_BIN) $(TEST_HELLO_DEP) check-syscall-coverage: @python3 scripts/check-syscall-coverage.py +define RUN_OPTIONAL_SKIP77 + @set -e; \ + rc=0; \ + $(1) || rc=$$?; \ + if [ "$$rc" = 77 ]; then \ + printf "$(YELLOW)SKIP$(RESET) %s\n" "$(2)"; \ + elif [ "$$rc" != 0 ]; then \ + exit "$$rc"; \ + fi +endef + ## Run the unit test suite plus busybox applet validation check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage @bash tests/driver.sh -e $(ELFUSE_BIN) -d $(TEST_DIR) -v @@ -35,6 +48,8 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage @$(MAKE) --no-print-directory test-fuse-alpine @printf "\n$(BLUE)━━━ timeout=0 validation ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-timeout-disable + @printf "\n$(BLUE)━━━ rosetta CLI gating ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-rosetta-cli test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename @set -e; \ @@ -114,6 +129,40 @@ test-timeout-disable: $(ELFUSE_BIN) $(TEST_HELLO_DEP) test-gdbstub: $(ELFUSE_BIN) $(TEST_DIR)/test-hello @bash tests/test-gdbstub.sh -e $(ELFUSE_BIN) -v +## Run Rosetta CLI gating regressions without requiring Rosetta runtime support +test-rosetta-cli: 
$(ELFUSE_BIN) + @bash tests/test-rosetta-cli.sh $(ELFUSE_BIN) + +## Smoke test x86_64 statics through Rosetta. Requires Rosetta-for-Linux +## installed on the host and the Alpine x86_64 fixture tree staged via +## INCLUDE_X86_64=1 bash tests/fetch-fixtures.sh. Skips cleanly otherwise. +test-rosetta-statics: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-statics.sh $(ELFUSE_BIN),test-rosetta-statics) + +## Probe known-unsupported scenarios (dynamic x86_64, mid-process execve, +## --gdb on x86_64, --no-rosetta). Verifies the failure path emits a +## stable error rather than crashing or succeeding silently. +test-rosetta-failure-modes: $(ELFUSE_BIN) + @bash tests/test-rosetta-failure-modes.sh $(ELFUSE_BIN) + +## Alpine x86_64 file-I/O + text-pipeline coverage. Lifts the matrix's +## busybox-applet style tests against the Alpine staticbin tree but stays +## lightweight enough for the rosetta-all aggregate. Skips cleanly when +## fixtures or Rosetta-for-Linux are missing. +test-rosetta-alpine: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-alpine.sh $(ELFUSE_BIN),test-rosetta-alpine) + +## Run every Rosetta-specific test target in sequence. +test-rosetta-all: test-rosetta-cli test-rosetta-failure-modes \ + test-rosetta-statics test-rosetta-alpine + +## Wall-clock bench harness for x86_64-via-Rosetta workloads. Prints +## best-of-N samples plus the aarch64 reference where available. Set +## BENCH_ITERS= to change sample count (default 5). +BENCH_ITERS ?= 5 +bench-rosetta: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/bench-rosetta.sh $(ELFUSE_BIN) $(BENCH_ITERS),bench-rosetta) + ## Alias for check (backward compat) test-all: check @@ -407,6 +456,11 @@ test-matrix-elfuse-aarch64: $(ELFUSE_BIN) $(TEST_DEPS) test-matrix-qemu-aarch64: $(ELFUSE_BIN) $(TEST_DEPS) @bash tests/test-matrix.sh qemu-aarch64 +## Probe the x86_64-via-Rosetta matrix wiring. 
Fails closed until the runtime +## and fixture corpus are complete enough to execute real coverage. +test-matrix-elfuse-x86_64: $(ELFUSE_BIN) $(TEST_DEPS) + @bash tests/test-matrix.sh elfuse-x86_64 + # Full test suite ## Run the complete test suite (aarch64: unit + busybox + gdbstub + coreutils + static + dynamic) test-full: $(ELFUSE_BIN) diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index 68a292f..eb61c63 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -19,13 +19,16 @@ #include "utils.h" #include "core/bootstrap.h" +#include "core/rosetta.h" #include "core/stack.h" #include "core/vdso.h" #include "runtime/thread.h" #include "syscall/abi.h" +#include "syscall/fuse.h" #include "syscall/internal.h" +#include "syscall/path.h" #include "syscall/proc.h" #include "debug/log.h" @@ -50,6 +53,104 @@ static bool append_boot_region(mem_region_t *regions, return true; } +/* Emit one mem_region_t per PT_LOAD segment of an ELF image, offset by the + * caller-supplied load base. Returns false if the boot region array fills up. + */ +static bool append_elf_segment_regions(mem_region_t *regions, + int *nregions, + const elf_info_t *info, + uint64_t load_base) +{ + for (int i = 0; i < info->num_segments; i++) { + uint64_t seg_start = info->segments[i].gpa + load_base; + uint64_t seg_end = seg_start + info->segments[i].memsz; + if (!append_boot_region(regions, nregions, seg_start, seg_end, + elf_pf_to_prot(info->segments[i].flags))) { + return false; + } + } + return true; +} + +/* Register one semantic guest_region_t per PT_LOAD segment of an ELF image. + * va_load_base controls the guest-visible range, gpa_load_base controls the + * backing GPA recorded in region metadata, and path is used for + * /proc/self/maps reporting. 
+ */ +static void register_elf_segment_regions(guest_t *g, + const elf_info_t *info, + uint64_t va_load_base, + uint64_t gpa_load_base, + const char *path) +{ + for (int i = 0; i < info->num_segments; i++) { + uint64_t seg_start = info->segments[i].gpa + va_load_base; + uint64_t seg_end = seg_start + info->segments[i].memsz; + uint64_t seg_gpa = info->segments[i].gpa + gpa_load_base; + guest_region_add_ex_gpa(g, seg_start, seg_end, seg_gpa, + elf_pf_to_prot(info->segments[i].flags), + LINUX_MAP_PRIVATE, info->segments[i].offset, + path, -1); + } +} + +/* Publish shim, shim-data, heap, stack-guard, and stack regions to the + * /proc/self/maps view, and invalidate the null page and stack-guard PTEs. + * Shared by guest_bootstrap_prepare and guest_bootstrap_rosetta_post_reset; + * the caller registers ELF or rosetta segments separately because those + * differ between aarch64 and rosetta guests. + */ +static void register_runtime_regions(guest_t *g, size_t shim_bin_len) +{ + guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len, + LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, + "[shim]"); + guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, + LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, + "[shim-data]"); + + if (g->brk_base < g->brk_current) { + guest_region_add(g, g->brk_base, g->brk_current, + LINUX_PROT_READ | LINUX_PROT_WRITE, + LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, 0, "[heap]"); + } + + guest_invalidate_ptes(g, g->stack_base, g->stack_base + STACK_GUARD_SIZE); + guest_region_add(g, g->stack_base, g->stack_base + STACK_GUARD_SIZE, + LINUX_PROT_NONE, LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, + 0, "[stack-guard]"); + guest_region_add(g, g->stack_base + STACK_GUARD_SIZE, g->stack_top, + LINUX_PROT_READ | LINUX_PROT_WRITE, + LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, 0, "[stack]"); + guest_invalidate_ptes(g, 0, 0x1000); +} + +int guest_bootstrap_probe_elf(const char *elf_path, elf_info_t *info) +{ + memset(info, 0, 
sizeof(*info)); + + FILE *f = fopen(elf_path, "rb"); + if (!f) + return -1; + + elf64_ehdr_t ehdr; + size_t nread = fread(&ehdr, 1, sizeof(ehdr), f); + fclose(f); + if (nread < sizeof(ehdr)) + return -1; + + if (ehdr.e_ident[0] != ELFMAG0 || ehdr.e_ident[1] != ELFMAG1 || + ehdr.e_ident[2] != ELFMAG2 || ehdr.e_ident[3] != ELFMAG3) + return -1; + if (ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_DATA] != ELFDATA2LSB) + return -1; + + info->e_machine = ehdr.e_machine; + info->e_type = ehdr.e_type; + return 0; +} + static void invalidate_exec_segments(const elf_info_t *info, void *host_base, uint64_t load_base) @@ -93,18 +194,54 @@ static bool load_interpreter(guest_t *g, if (boot->elf_info.interp_path[0] == '\0') return true; + bool interp_host_temp = false; + char interp_host_candidate[LINUX_PATH_MAX]; elf_resolve_interp(sysroot, boot->elf_info.interp_path, - boot->interp_resolved, sizeof(boot->interp_resolved)); + interp_host_candidate, sizeof(interp_host_candidate)); + if (strcmp(interp_host_candidate, boot->elf_info.interp_path) == 0) { + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, boot->elf_info.interp_path, + PATH_TR_NONE, &tx) < 0) { + log_error("failed to resolve interpreter: %s", + boot->elf_info.interp_path); + return false; + } + if (tx.fuse_path) { + int rc = + fuse_materialize_path(tx.intercept_path, boot->interp_resolved, + sizeof(boot->interp_resolved)); + if (rc < 0) { + log_error("failed to materialize interpreter: %s", + boot->elf_info.interp_path); + return false; + } + interp_host_temp = true; + } else { + str_copy_trunc(boot->interp_resolved, tx.host_path, + sizeof(boot->interp_resolved)); + } + } else { + str_copy_trunc(boot->interp_resolved, interp_host_candidate, + sizeof(boot->interp_resolved)); + } + str_copy_trunc( + boot->interp_display_path, + interp_host_temp ? 
boot->elf_info.interp_path : boot->interp_resolved, + sizeof(boot->interp_display_path)); log_debug("loading interpreter: %s", boot->interp_resolved); if (elf_load(boot->interp_resolved, &boot->interp_info) < 0) { log_error("failed to load interpreter: %s", boot->interp_resolved); + if (interp_host_temp) + unlink(boot->interp_resolved); return false; } if (boot->interp_info.e_machine != EM_AARCH64) { log_error("interpreter has unsupported machine type %u: %s", boot->interp_info.e_machine, boot->interp_resolved); + if (interp_host_temp) + unlink(boot->interp_resolved); return false; } @@ -112,8 +249,12 @@ static bool load_interpreter(guest_t *g, if (elf_map_segments(&boot->interp_info, boot->interp_resolved, g->host_base, g->guest_size, boot->interp_base) < 0) { log_error("failed to map interpreter segments"); + if (interp_host_temp) + unlink(boot->interp_resolved); return false; } + if (interp_host_temp) + unlink(boot->interp_resolved); log_debug( "interpreter loaded at base=0x%llx, entry=0x%llx, %d segments", @@ -143,24 +284,19 @@ static bool build_boot_regions(mem_region_t *regions, return false; } - for (int i = 0; i < boot->elf_info.num_segments; i++) { - if (!append_boot_region( - regions, nregions, - boot->elf_info.segments[i].gpa + boot->elf_load_base, - boot->elf_info.segments[i].gpa + - boot->elf_info.segments[i].memsz + boot->elf_load_base, - elf_pf_to_prot(boot->elf_info.segments[i].flags))) { - return false; - } - } - - for (int i = 0; i < boot->interp_info.num_segments; i++) { - if (!append_boot_region( - regions, nregions, - boot->interp_info.segments[i].gpa + boot->interp_base, - boot->interp_info.segments[i].gpa + - boot->interp_info.segments[i].memsz + boot->interp_base, - elf_pf_to_prot(boot->interp_info.segments[i].flags))) { + /* Rosetta guests never load the x86_64 ELF or its interpreter into + * guest memory; rosetta itself reads the target via fd 3 once it is + * running. 
Adding those segments to the page-table builder would emit + * ghost L2/L3 entries at the binary's x86_64 link address (typically + * 0x400000) pointing into uninitialised primary-buffer GPAs. The + * rosetta image's own segments are registered by rosetta_prepare's + * separate region append in the bootstrap caller. + */ + if (!g->is_rosetta) { + if (!append_elf_segment_regions(regions, nregions, &boot->elf_info, + boot->elf_load_base) || + !append_elf_segment_regions(regions, nregions, &boot->interp_info, + boot->interp_base)) { return false; } } @@ -182,7 +318,9 @@ static bool build_boot_regions(mem_region_t *regions, } int guest_bootstrap_prepare(guest_t *g, - const char *elf_path, + const char *elf_host_path, + bool elf_host_path_temp, + const char *elf_guest_path, const char *sysroot, int guest_argc, const char **guest_argv, @@ -200,57 +338,98 @@ int guest_bootstrap_prepare(guest_t *g, memset(boot, 0, sizeof(*boot)); *guest_initialized = false; - if (elf_load(elf_path, &boot->elf_info) < 0) { - log_error("failed to load ELF: %s", elf_path); + if (elf_load(elf_host_path, &boot->elf_info) < 0) { + log_error("failed to load ELF: %s", elf_host_path); return -1; } - if (boot->elf_info.e_machine != EM_AARCH64) { - log_error("unsupported ELF machine type %u (only aarch64 is supported)", - boot->elf_info.e_machine); + bool want_rosetta = false; + if (boot->elf_info.e_machine == EM_X86_64) { + if (!proc_rosetta_enabled()) { + log_error( + "x86_64 ELF rejected by --no-rosetta " + "(or ELFUSE_NO_ROSETTA=1): %s", + elf_guest_path); + return -1; + } + want_rosetta = true; + } else if (boot->elf_info.e_machine != EM_AARCH64) { + log_error("unsupported ELF machine type %u", boot->elf_info.e_machine); return -1; } log_debug( "ELF entry=0x%llx, %d segments, load range [0x%llx, 0x%llx), " - "machine=aarch64", + "machine=%s", (unsigned long long) boot->elf_info.entry, boot->elf_info.num_segments, (unsigned long long) boot->elf_info.load_min, - (unsigned long long) 
boot->elf_info.load_max); - - if (guest_init(g, 0, 0) < 0) { + (unsigned long long) boot->elf_info.load_max, + want_rosetta ? "x86_64-via-rosetta" : "aarch64"); + + /* Rosetta is statically linked at 0x800000000000 (128 TiB), beyond the + * 36 and 40-bit IPA ranges. Request 48-bit IPA up-front so the + * page-table builder can reach the rosetta segments. HVF clamps to its + * supported size; on M1 hosts the upstream hyper-linux audit confirms + * 48 is honoured even though the auto-detect default returns 36, so + * the request is non-fatal in either direction. + */ + uint32_t req_ipa = want_rosetta ? 48 : 0; + if (guest_init(g, 0, req_ipa) < 0) { log_error("failed to initialize guest"); return -1; } *guest_initialized = true; + g->is_rosetta = want_rosetta; + proc_set_rosetta_active(want_rosetta); log_debug("IPA size: %u bits (%llu GiB primary)", g->ipa_bits, (unsigned long long) (g->guest_size / (1024ULL * 1024 * 1024))); - boot->elf_load_base = (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0; - if (elf_map_segments(&boot->elf_info, elf_path, g->host_base, g->guest_size, - boot->elf_load_base) < 0) { - log_error("failed to map ELF segments"); - return -1; - } + rosetta_result_t rr; + memset(&rr, 0, sizeof(rr)); + + if (want_rosetta) { + /* Rosetta path: no x86_64 ELF segments are loaded into guest memory + * (rosetta itself does that lazily once it starts running). brk and + * stack use the same defaults the aarch64 path falls back to when + * the binary sits at low VAs; the x86_64 binary's load_max would be + * meaningless here because nothing of it actually lives in primary + * buffer GPA space. + */ + boot->elf_load_base = 0; + g->elf_load_min = ELF_DEFAULT_BASE; + g->brk_base = BRK_BASE_DEFAULT; + g->brk_current = g->brk_base; + g->stack_top = STACK_TOP_DEFAULT; + g->stack_base = g->stack_top - STACK_SIZE; + } else { + boot->elf_load_base = + (boot->elf_info.e_type == ET_DYN) ? 
PIE_LOAD_BASE : 0; + if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base, + g->guest_size, boot->elf_load_base) < 0) { + log_error("failed to map ELF segments"); + return -1; + } - /* Track the lowest loaded ELF address so the legacy fork IPC path - * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. - */ - g->elf_load_min = boot->elf_info.load_min + boot->elf_load_base; + /* Track the lowest loaded ELF address so the legacy fork IPC path + * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. + */ + g->elf_load_min = boot->elf_info.load_min + boot->elf_load_base; - g->brk_base = PAGE_ALIGN_UP(boot->elf_info.load_max + boot->elf_load_base); - if (g->brk_base < BRK_BASE_DEFAULT) - g->brk_base = BRK_BASE_DEFAULT; - g->brk_current = g->brk_base; + g->brk_base = + PAGE_ALIGN_UP(boot->elf_info.load_max + boot->elf_load_base); + if (g->brk_base < BRK_BASE_DEFAULT) + g->brk_base = BRK_BASE_DEFAULT; + g->brk_current = g->brk_base; - g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MIB) + STACK_SIZE; - if (g->stack_top < STACK_TOP_DEFAULT) - g->stack_top = STACK_TOP_DEFAULT; - g->stack_base = g->stack_top - STACK_SIZE; + g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MIB) + STACK_SIZE; + if (g->stack_top < STACK_TOP_DEFAULT) + g->stack_top = STACK_TOP_DEFAULT; + g->stack_base = g->stack_top - STACK_SIZE; - if (!load_interpreter(g, sysroot, boot)) - return -1; + if (!load_interpreter(g, sysroot, boot)) + return -1; + } if (shim_bin_len > BLOCK_2MIB) { log_error("shim binary too large (%zu bytes)", shim_bin_len); @@ -261,10 +440,12 @@ int guest_bootstrap_prepare(guest_t *g, log_debug("shim loaded at offset 0x%llx (%zu bytes)", (unsigned long long) g->shim_base, shim_bin_len); - invalidate_exec_segments(&boot->elf_info, g->host_base, - boot->elf_load_base); - invalidate_exec_segments(&boot->interp_info, g->host_base, - boot->interp_base); + if (!want_rosetta) { + invalidate_exec_segments(&boot->elf_info, g->host_base, + boot->elf_load_base); + 
invalidate_exec_segments(&boot->interp_info, g->host_base, + boot->interp_base); + } sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, shim_bin_len); @@ -274,6 +455,19 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } + /* Rosetta path: append the rosetta image as a non-identity region so the + * page-table builder maps VA 0x800000000000 -> primary buffer GPA. + * rosetta_prepare also initialises the TTBR1 kbuf (page-table pages come + * from the same pool that guest_build_page_tables is about to consume). + */ + if (want_rosetta) { + if (rosetta_prepare(g, elf_host_path, regions, &nregions, + MAX_BOOT_REGIONS, verbose, &rr) < 0) { + log_error("rosetta_prepare failed for %s", elf_guest_path); + return -1; + } + } + boot->ttbr0 = guest_build_page_tables(g, regions, nregions); if (!boot->ttbr0) { log_error("failed to build page tables"); @@ -285,55 +479,32 @@ int guest_bootstrap_prepare(guest_t *g, * whose slot is later consumed by an unrelated syscall. */ - guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len, - LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, - "[shim]"); - guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, - LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, - "[shim-data]"); - - { + if (want_rosetta) { + /* /proc/self/maps for a rosetta guest reports the rosetta translator + * as a single anonymous region covering [VA, VA+size). The original + * x86_64 binary is not loaded into guest memory; rosetta exposes it + * via fd 3 once rosetta_finalize pre-opens it. 
+ */ + register_elf_segment_regions(g, &rr.rosetta_info, 0, + g->rosetta_guest_base - g->rosetta_va_base, + ROSETTA_PATH); + } else { char elf_realpath[LINUX_PATH_MAX]; memset(elf_realpath, 0, sizeof(elf_realpath)); - if (!realpath(elf_path, elf_realpath)) - str_copy_trunc(elf_realpath, elf_path, sizeof(elf_realpath)); - - for (int i = 0; i < boot->elf_info.num_segments; i++) { - guest_region_add( - g, boot->elf_info.segments[i].gpa + boot->elf_load_base, - boot->elf_info.segments[i].gpa + - boot->elf_info.segments[i].memsz + boot->elf_load_base, - elf_pf_to_prot(boot->elf_info.segments[i].flags), - LINUX_MAP_PRIVATE, boot->elf_info.segments[i].offset, - elf_realpath); - } + if (elf_host_path_temp) + str_copy_trunc(elf_realpath, elf_guest_path, sizeof(elf_realpath)); + else if (!realpath(elf_host_path, elf_realpath)) + str_copy_trunc(elf_realpath, elf_host_path, sizeof(elf_realpath)); + + register_elf_segment_regions(g, &boot->elf_info, boot->elf_load_base, + boot->elf_load_base, elf_realpath); + register_elf_segment_regions(g, &boot->interp_info, boot->interp_base, + boot->interp_base, + boot->interp_display_path); } - for (int i = 0; i < boot->interp_info.num_segments; i++) { - guest_region_add( - g, boot->interp_info.segments[i].gpa + boot->interp_base, - boot->interp_info.segments[i].gpa + - boot->interp_info.segments[i].memsz + boot->interp_base, - elf_pf_to_prot(boot->interp_info.segments[i].flags), - LINUX_MAP_PRIVATE, boot->interp_info.segments[i].offset, - boot->interp_resolved); - } - - if (g->brk_base < g->brk_current) { - guest_region_add(g, g->brk_base, g->brk_current, - LINUX_PROT_READ | LINUX_PROT_WRITE, - LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, 0, "[heap]"); - } - - guest_invalidate_ptes(g, g->stack_base, g->stack_base + STACK_GUARD_SIZE); - guest_region_add(g, g->stack_base, g->stack_base + STACK_GUARD_SIZE, - LINUX_PROT_NONE, LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, - 0, "[stack-guard]"); - guest_region_add(g, g->stack_base + STACK_GUARD_SIZE, 
g->stack_top, - LINUX_PROT_READ | LINUX_PROT_WRITE, - LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, 0, "[stack]"); - guest_invalidate_ptes(g, 0, 0x1000); + register_runtime_regions(g, shim_bin_len); log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0, (unsigned long long) g->ipa_base); @@ -352,32 +523,71 @@ int guest_bootstrap_prepare(guest_t *g, } proc_set_shim(shim_bin, (unsigned int) shim_bin_len); - proc_set_elf_path(elf_path); + proc_set_elf_path(elf_guest_path); if (sysroot) proc_set_sysroot(sysroot); - proc_set_cmdline(guest_argc, guest_argv); + + /* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the + * binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes + * /proc/self/cmdline, and installs the TTBR0 kbuf alias. The aarch64 path + * uses the caller's argv directly. The remaining Rosetta runtime blocker is + * high-VA mmap support for the translator's own slab and JIT allocations. + */ + int rosetta_argc = 0; + const char **rosetta_argv = NULL; + if (want_rosetta) { + if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp, + elf_guest_path, guest_argc, guest_argv, &rr, + verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) { + log_error("rosetta_finalize failed"); + return -1; + } + } else { + proc_set_cmdline(guest_argc, guest_argv); + } proc_set_environ((const char **) environ); native_vdso = vdso_build(g); linux_stack_auxv_t auxv; + const elf_info_t *stack_elf = + want_rosetta ? &rr.rosetta_info : &boot->elf_info; + uint64_t stack_elf_load_base = want_rosetta ? 0 : boot->elf_load_base; + uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base; + int stack_argc = want_rosetta ? rosetta_argc : guest_argc; + const char **stack_argv = want_rosetta ? 
rosetta_argv : guest_argv; boot->stack_pointer = build_linux_stack( - g, g->stack_top, guest_argc, guest_argv, (const char **) environ, - &boot->elf_info, boot->elf_load_base, boot->interp_base, native_vdso, - -1, &auxv); + g, g->stack_top, stack_argc, stack_argv, (const char **) environ, + stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1, + &auxv); if (boot->stack_pointer == 0) { log_error("failed to build initial stack"); + free(rosetta_argv); return -1; } + /* rosetta_argv was copied into the guest stack; the host allocation is + * no longer needed. The strings themselves are constants (ROSETTA_PATH) + * or owned by the caller (binary_path, guest_argv entries) so freeing + * just the array is safe. + */ + free(rosetta_argv); proc_set_auxv(auxv.words, auxv.nwords * sizeof(auxv.words[0])); - boot->entry_point = (boot->interp_base != 0) - ? (boot->interp_info.entry + boot->interp_base) - : (boot->elf_info.entry + boot->elf_load_base); + if (want_rosetta) { + boot->entry_point = rr.entry_point; + } else { + boot->entry_point = (boot->interp_base != 0) + ? (boot->interp_info.entry + boot->interp_base) + : (boot->elf_info.entry + boot->elf_load_base); + } + const char *entry_via = ""; + if (want_rosetta) + entry_via = " (via rosetta)"; + else if (boot->interp_base) + entry_via = " (via interpreter)"; log_debug("SP=0x%llx, entry=0x%llx%s", (unsigned long long) boot->stack_pointer, - (unsigned long long) boot->entry_point, - boot->interp_base ? " (via interpreter)" : ""); + (unsigned long long) boot->entry_point, entry_via); return 0; } @@ -389,7 +599,13 @@ int guest_bootstrap_create_vcpu(guest_t *g, { uint64_t sctlr; uint64_t sctlr_with_mmu; - uint64_t tcr_value = TCR_EL1_VALUE; + /* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at + * KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps + * working. Aarch64 guests stay on the EPD1=1 variant which keeps the + * upper VA range fault-clean. 
+ */ + uint64_t tcr_value = g->is_rosetta ? TCR_EL1_VALUE_KBUF : TCR_EL1_VALUE; + uint64_t ttbr1_value = g->is_rosetta ? g->ttbr1 : 0; uint64_t shim_ipa = guest_ipa(g, g->shim_base); uint64_t entry_ipa = guest_ipa(g, boot->entry_point); uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer); @@ -409,7 +625,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, boot->ttbr0)); - HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, 0)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, ttbr1_value)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, 3ULL << 20)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, entry_ipa)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, 0x0)); @@ -455,3 +671,103 @@ int guest_bootstrap_create_vcpu(guest_t *g, return 0; } + +int guest_bootstrap_rosetta_post_reset(guest_t *g, + const char *elf_host_path, + bool elf_host_path_temp, + const char *elf_guest_path, + int guest_argc, + const char **guest_argv, + char **environ, + size_t shim_bin_len, + bool verbose, + uint64_t *out_entry_point, + uint64_t *out_stack_pointer, + uint64_t *out_ttbr0) +{ + if (!g || !elf_host_path || !elf_guest_path || !out_entry_point || + !out_stack_pointer || !out_ttbr0) + return -1; + + /* Re-anchor brk/stack to the Rosetta defaults. guest_reset already + * restored mmap_next/mmap_end/mmap_rx_* to their initial values, but + * brk/stack were tuned for the previous image, so reset them here. + * The x86_64 target binary lives behind fd 3, not in guest memory, + * so brk_base does not move with the target's load_max. 
+ */ + g->elf_load_min = ELF_DEFAULT_BASE; + g->brk_base = BRK_BASE_DEFAULT; + g->brk_current = g->brk_base; + g->stack_top = STACK_TOP_DEFAULT; + g->stack_base = g->stack_top - STACK_SIZE; + + mem_region_t regions[MAX_BOOT_REGIONS]; + int nregions = 0; + rosetta_result_t rr; + /* Zero rr first, as guest_bootstrap_prepare does, so no field is read + * uninitialised if rosetta_prepare does not fill every member. + */ + memset(&rr, 0, sizeof(rr)); + + if (rosetta_prepare(g, elf_host_path, regions, &nregions, MAX_BOOT_REGIONS, + verbose, &rr) < 0) { + log_error("rosetta_prepare failed during exec re-bootstrap"); + return -1; + } + + /* build_boot_regions skips ELF segments when g->is_rosetta is set, so a + * zero-initialised guest_bootstrap_t is enough to drive it here. + */ + guest_bootstrap_t boot_stub; + memset(&boot_stub, 0, sizeof(boot_stub)); + if (!build_boot_regions(regions, &nregions, g, &boot_stub, shim_bin_len)) { + log_error("too many boot regions for rosetta exec re-bootstrap"); + return -1; + } + + uint64_t ttbr0 = guest_build_page_tables(g, regions, nregions); + if (!ttbr0) { + log_error( + "guest_build_page_tables failed in rosetta exec re-bootstrap"); + return -1; + } + g->ttbr0 = ttbr0; + + /* Re-publish /proc/self/maps style metadata. Mirrors the bootstrap path: + * one ROSETTA_PATH-named region per rosetta PT_LOAD segment, plus the + * heap, stack, stack-guard, shim, and shim-data. 
+ */ + register_elf_segment_regions(g, &rr.rosetta_info, 0, + g->rosetta_guest_base - g->rosetta_va_base, + ROSETTA_PATH); + register_runtime_regions(g, shim_bin_len); + + int rosetta_argc = 0; + const char **rosetta_argv = NULL; + if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp, + elf_guest_path, guest_argc, guest_argv, &rr, verbose, + &rosetta_argc, &rosetta_argv, NULL) < 0) { + log_error("rosetta_finalize failed during exec re-bootstrap"); + return -1; + } + + proc_set_elf_path(elf_guest_path); + proc_set_environ((const char **) environ); + + uint64_t native_vdso = vdso_build(g); + linux_stack_auxv_t auxv; + uint64_t sp = build_linux_stack( + g, g->stack_top, rosetta_argc, rosetta_argv, (const char **) environ, + &rr.rosetta_info, 0, 0, native_vdso, -1 /* AT_EXECFD absent */, &auxv); + free(rosetta_argv); + if (sp == 0) { + log_error("build_linux_stack failed during exec re-bootstrap"); + return -1; + } + proc_set_auxv(auxv.words, auxv.nwords * sizeof(auxv.words[0])); + + *out_entry_point = rr.entry_point; + *out_stack_pointer = sp; + *out_ttbr0 = ttbr0; + return 0; +} diff --git a/src/core/bootstrap.h b/src/core/bootstrap.h index e2ce4c4..8527b8b 100644 --- a/src/core/bootstrap.h +++ b/src/core/bootstrap.h @@ -20,6 +20,7 @@ typedef struct { elf_info_t elf_info; elf_info_t interp_info; char interp_resolved[LINUX_PATH_MAX]; + char interp_display_path[LINUX_PATH_MAX]; uint64_t elf_load_base; uint64_t interp_base; uint64_t ttbr0; @@ -28,7 +29,9 @@ typedef struct { } guest_bootstrap_t; int guest_bootstrap_prepare(guest_t *g, - const char *elf_path, + const char *elf_host_path, + bool elf_host_path_temp, + const char *elf_guest_path, const char *sysroot, int guest_argc, const char **guest_argv, @@ -39,8 +42,51 @@ int guest_bootstrap_prepare(guest_t *g, bool *guest_initialized, guest_bootstrap_t *boot); +/* Lightweight ELF header probe used by CLI preflight checks. 
*/ +int guest_bootstrap_probe_elf(const char *elf_path, elf_info_t *info); + int guest_bootstrap_create_vcpu(guest_t *g, const guest_bootstrap_t *boot, bool verbose, hv_vcpu_t *out_vcpu, hv_vcpu_exit_t **out_vexit); + +/* Post-reset Rosetta re-bootstrap helper used by sys_execve when an existing + * guest transitions to (or stays inside) an x86_64-via-Rosetta image. The + * caller must already have: + * - called guest_reset() on g + * - restored shim bytes into [g->shim_base, g->shim_base + shim_bin_len) + * - if the parent was a non-rosetta guest, set g->is_rosetta = true and + * proc_set_rosetta_active(true) so rosetta_prepare and rosettad gates + * see the right runtime state + * + * elf_host_path is the macOS filesystem path used by rosetta_prepare to + * open the binary (after sysroot/FUSE resolution). elf_guest_path is the + * unresolved guest-visible path published through proc_set_elf_path and + * rosetta_finalize's /proc/self/cmdline rewrite. + * + * The helper runs rosetta_prepare, appends every region the page-table + * builder needs, rebuilds page tables, registers guest_region_t entries + * for /proc/self/maps, runs rosetta_finalize (pre-opens fd 3, installs the + * kbuf user alias, publishes the binfmt_misc argv via proc_set_cmdline), + * and builds the initial Linux stack using the rosetta image as the + * AT_PHDR/AT_BASE ELF metadata. It does NOT touch the vCPU sysregs -- + * the caller writes TCR_EL1, TTBR0_EL1, TTBR1_EL1, ELR_EL1, SP_EL0, and + * PC itself once the out_* fields are returned. + * + * Returns 0 on success with out_entry_point, out_stack_pointer, out_ttbr0 + * set. Returns -1 on any internal failure; the caller is past the point of + * no return and treats that as fatal. 
+ */ +int guest_bootstrap_rosetta_post_reset(guest_t *g, + const char *elf_host_path, + bool elf_host_path_temp, + const char *elf_guest_path, + int guest_argc, + const char **guest_argv, + char **environ, + size_t shim_bin_len, + bool verbose, + uint64_t *out_entry_point, + uint64_t *out_stack_pointer, + uint64_t *out_ttbr0); diff --git a/src/core/guest.c b/src/core/guest.c index 5e3d294..d00a49a 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -220,7 +220,11 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) if (max_ipa < 36) max_ipa = 36; - /* Determine VM IPA width. + /* Determine VM IPA width: the Stage-2 width passed to + * hv_vm_config_set_ipa_size. Distinct from the primary slab size below + * because Rosetta needs 48-bit guest VAs (image at 128 TiB) even when + * HVF rejects a 1 TiB Stage-2 mapping. + * * ipa_bits = 0 : auto-detect (40-bit on macOS 15+, else 36-bit). * ipa_bits > 0 : use that exact value. */ @@ -231,89 +235,26 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) vm_ipa = 40; else vm_ipa = 36; - - /* Primary buffer size: use the VM's configured IPA width (capped at - * 40-bit = 1TiB). macOS demand-pages the host reservation, so only touched - * pages cost physical memory. - */ - uint32_t buf_bits = (vm_ipa > 40) ? 40 : vm_ipa; - uint64_t buf_capacity = 1ULL << buf_bits; - if (size == 0 || size > buf_capacity) - size = buf_capacity; - g->guest_size = size; g->ipa_bits = vm_ipa; - /* Compute dynamic layout limits from primary buffer size. - * interp_base: last 4GiB (dynamic linker load address) - * mmap_limit: last 8GiB reserved (max mmap RW address) - * For 64GiB: interp=60GiB, mmap_limit=56GiB - * For 1TiB: interp=1020GiB, mmap_limit=1016GiB + /* Primary slab size is decoupled from the VM IPA width. 
The slab is + * what gets mmap'd and what hv_vm_map maps; some Apple Silicon hosts + * (including M5 in field reports) reject a 1 TiB primary slab with + * HV_BAD_ARGUMENT even when max_ipa >= 40, so a bisecting retry from + * 40-bit (1 TiB) down to 36-bit (64 GiB) is necessary. */ - g->interp_base = g->guest_size - 0x100000000ULL; - g->mmap_limit = g->guest_size - 0x200000000ULL; - if (compute_infra_layout(g) < 0) - return -1; - g->pt_pool_next = g->pt_pool_base; - - /* Reserve primary address space via mmap(MAP_ANON). macOS demand-pages - * this: physical pages are allocated only on first touch, so reserving up - * to 1TiB costs nothing until pages are actually used. Do NOT memset - * because that would touch all pages and defeat demand paging. - */ - g->host_base = - mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); - if (g->host_base == MAP_FAILED) { - perror("guest: mmap"); - g->host_base = NULL; - return -1; - } - - /* Upgrade to file-backed shared memory for CoW fork support. - * mkstemp + unlink + ftruncate + MAP_SHARED|MAP_FIXED replaces the - * anonymous mapping with file-backed memory at the same host address. - * At fork time, the parent stays on MAP_SHARED (HVF caches VA->PA, so - * remapping would cause stale reads) and sends the file fd to the child. - * The child maps it MAP_PRIVATE, giving it an instant copy-on-write - * clone of all guest memory. - * - * macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular file - * fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED correctly. - * The file is unlinked immediately; the fd keeps it alive. macOS - * demand-pages file mappings, so untouched pages cost nothing. If any step - * fails, guest memory silently keeps the MAP_ANON mapping and falls back to - * the IPC region-copy path on fork. 
- */ - { - char tmppath[] = "/tmp/elfuse-XXXXXX"; - int sfd = mkstemp(tmppath); - if (sfd >= 0) { - unlink(tmppath); /* Unlink immediately; fd keeps file alive */ - if (ftruncate(sfd, (off_t) size) == 0) { - void *p = mmap(g->host_base, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, sfd, 0); - if (p != MAP_FAILED) { - g->shm_fd = sfd; - } else { - /* MAP_FIXED failed; keep the original MAP_ANON mapping */ - close(sfd); - } - } else { - close(sfd); - } - } - /* If shm_fd is still -1, guest memory is on MAP_ANON; fork uses IPC - * copy. - */ - } + uint32_t initial_slab_bits = (max_ipa > 40) ? 40 : max_ipa; + if (initial_slab_bits < 36) + initial_slab_bits = 36; - /* Create Hypervisor VM with the determined IPA width and map the - * primary slab at GUEST_IPA_BASE. + /* Create the HVF VM once at the requested IPA width. The slab retry + * loop below remaps within this VM; only the slab side resizes. * * macOS may not release HVF VM resources immediately after * hv_vm_destroy(), so rapid sequential VM creation (e.g. running - * many test binaries) can hit transient resource exhaustion. - * Retry with linear backoff (500ms intervals, up to 30 attempts = - * 15 seconds max wait) to handle this gracefully. + * many test binaries) can hit transient resource exhaustion. Retry + * with linear backoff (500ms intervals, up to 30 attempts = 15 + * seconds max wait) to handle this gracefully. 
*/ hv_return_t ret = HV_ERROR; for (int attempt = 0; attempt < 30; attempt++) { @@ -328,65 +269,102 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) if (ret != HV_SUCCESS) { log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret, vm_ipa); - munmap(g->host_base, size); - g->host_base = NULL; - if (g->shm_fd >= 0) { - close(g->shm_fd); - g->shm_fd = -1; - } return -1; } - ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, - HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); - if (ret != HV_SUCCESS && buf_bits > max_ipa) { - /* 1TiB primary map failed; fall back to hardware-default buffer. - * This handles undocumented HVF limits on primary buffer size. - * Close shm_fd since the fallback uses anonymous memory (the file is no - * longer mapped to host_base, so CoW fork cannot work). + /* Stepped slab retry: try the largest permitted slab first, then + * shrink by two bits (4x: 1 TiB -> 256 GiB -> 64 GiB floor) on each + * failure. The shm_fd CoW upgrade is attempted on every slab before + * hv_vm_map, so file-backed memory is kept at whichever size maps. + */ + static const uint32_t slab_attempt_bits[] = {40, 38, 36}; + bool mapped = false; + size_t mapped_size = 0; + for (size_t i = 0; + i < sizeof(slab_attempt_bits) / sizeof(slab_attempt_bits[0]); i++) { + uint32_t bits = slab_attempt_bits[i]; + if (bits > initial_slab_bits) + continue; + + uint64_t try_size = 1ULL << bits; + /* Respect a caller-supplied size cap (size > 0 means "no larger + * than this"). Skip slab attempts that exceed the cap. */ - log_info( - "guest: hv_vm_map %llu GiB failed (%d), " - "retrying with %u-bit (%llu GiB)", - (unsigned long long) (size >> 30), (int) ret, max_ipa, - 1ULL << (max_ipa - 30)); - munmap(g->host_base, size); - if (g->shm_fd >= 0) { - close(g->shm_fd); - g->shm_fd = -1; - } - buf_bits = (max_ipa > 40) ? 
40 : max_ipa; - size = 1ULL << buf_bits; - g->guest_size = size; - g->interp_base = size - 0x100000000ULL; - g->mmap_limit = size - 0x200000000ULL; - if (compute_infra_layout(g) < 0) { - hv_vm_destroy(); - return -1; - } + if (size > 0 && try_size > size) + continue; + + /* Re-derive the layout for this slab size. */ + g->guest_size = try_size; + g->interp_base = try_size - 0x100000000ULL; + g->mmap_limit = try_size - 0x200000000ULL; + g->overflow_ipa_next = try_size; + if (compute_infra_layout(g) < 0) + continue; g->pt_pool_next = g->pt_pool_base; - g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE, + + /* Reserve primary address space via mmap(MAP_ANON). macOS + * demand-pages this so an unused 1 TiB reservation costs no + * physical memory. Do NOT memset because that would touch every + * page and defeat demand paging. + */ + g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); if (g->host_base == MAP_FAILED) { - perror("guest: mmap (fallback)"); + perror("guest: mmap"); g->host_base = NULL; - hv_vm_destroy(); - return -1; + continue; } - ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, + + /* Try the file-backed CoW upgrade. If any step fails, fall back + * silently to MAP_ANON; fork will then use the IPC region-copy + * path instead of SCM_RIGHTS fd passing. 
+ */ + char tmppath[] = "/tmp/elfuse-XXXXXX"; + int sfd = mkstemp(tmppath); + if (sfd >= 0) { + unlink(tmppath); /* Unlink immediately; fd keeps file alive */ + if (ftruncate(sfd, (off_t) try_size) == 0) { + void *p = mmap(g->host_base, try_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, sfd, 0); + if (p != MAP_FAILED) { + g->shm_fd = sfd; + } else { + close(sfd); + } + } else { + close(sfd); + } + } + + ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); - } - if (ret != HV_SUCCESS) { - log_error("guest: hv_vm_map failed: %d", (int) ret); - hv_vm_destroy(); - munmap(g->host_base, size); - g->host_base = NULL; + if (ret == HV_SUCCESS) { + mapped_size = try_size; + mapped = true; + log_info("guest: primary slab %u GiB (%u-bit) mapped", + (unsigned) (try_size >> 30), bits); + break; + } + + log_info("guest: hv_vm_map %u GiB failed (%d), trying smaller slab", + (unsigned) (try_size >> 30), (int) ret); if (g->shm_fd >= 0) { close(g->shm_fd); g->shm_fd = -1; } + munmap(g->host_base, try_size); + g->host_base = NULL; + } + + if (!mapped) { + log_error( + "guest: hv_vm_map failed at every attempted slab size " + "(start=%u-bit, floor=36-bit)", + initial_slab_bits); + hv_vm_destroy(); return -1; } + size = mapped_size; /* Seed HVF segment list with one entry covering the whole slab. * sys_mmap may later split this for MAP_SHARED file overlays. @@ -416,6 +394,7 @@ int guest_init_from_shm(guest_t *g, /* Compute layout limits (same formula as guest_init) */ g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; + g->overflow_ipa_next = size; if (compute_infra_layout(g) < 0) return -1; g->pt_pool_next = g->pt_pool_base; @@ -479,6 +458,45 @@ int guest_init_from_shm(guest_t *g, return 0; } +/* Tear down all overflow segments. Each segment is owned by + * guest_overflow_alloc, so the host buffer and its Stage-2 mapping are + * released here. 
Resets noverflow and overflow_ipa_next to the supplied + * anchor (guest_size for guest_reset, 0 for guest_destroy). + */ +static void release_overflow_segments(guest_t *g, uint64_t reset_anchor) +{ + for (int i = 0; i < g->noverflow; i++) { + guest_overflow_t *o = &g->overflow[i]; + if (o->host_base && o->size) { + hv_vm_unmap(o->ipa_start, o->size); + munmap(o->host_base, o->size); + } + o->host_base = NULL; + o->size = 0; + } + g->noverflow = 0; + g->overflow_ipa_next = reset_anchor; +} + +/* Tear down all extra (non-primary) IPA mappings recorded in g->mappings[]. + * Each owned host buffer is freed; unowned mappings (host VA supplied by the + * caller of guest_add_mapping) only have their Stage-2 entry torn down. + * Resets n_mappings to 0. + */ +static void release_extra_mappings(guest_t *g) +{ + for (int i = 0; i < g->n_mappings; i++) { + guest_mapping_t *m = &g->mappings[i]; + if (m->host_va && m->size) + hv_vm_unmap(m->gpa, m->size); + if (m->owns_host && m->host_va) + munmap(m->host_va, m->size); + m->host_va = NULL; + m->size = 0; + } + g->n_mappings = 0; +} + void guest_destroy(guest_t *g) { /* Destroy all worker vCPUs (thread table) before tearing down the VM. @@ -497,6 +515,14 @@ void guest_destroy(guest_t *g) for (int i = 0; i < g->n_segments; i++) hv_vm_unmap(g->segments[i].ipa, g->segments[i].len); g->n_segments = 0; + + /* Release extra IPA mappings (rosetta segments etc.). hv_vm_destroy would + * release Stage-2 state on its own; the explicit unmap inside the helper + * keeps Instruments / leak tools accurate. + */ + release_extra_mappings(g); + release_overflow_segments(g, 0); + hv_vm_destroy(); if (g->host_base) { munmap(g->host_base, g->guest_size); @@ -516,6 +542,495 @@ void guest_destroy(guest_t *g) } } +/* Check whether a candidate IPA range [gpa, gpa+size) overlaps the primary + * buffer or any existing extra mapping. Returns true on overlap. 
+ */ +static bool guest_mapping_overlaps(const guest_t *g, uint64_t gpa, size_t size) +{ + if (size == 0) + return true; + uint64_t end = gpa + size; + if (end < gpa) + return true; /* arithmetic overflow */ + if (gpa < g->guest_size) + return true; /* would collide with the primary buffer */ + for (int i = 0; i < g->n_mappings; i++) { + const guest_mapping_t *m = &g->mappings[i]; + if (gpa < m->gpa + m->size && m->gpa < end) + return true; + } + /* Overflow segments occupy IPA ranges stacked above guest_size on a + * first-come basis. An explicit mapping added later must not silently + * land on top of an already-allocated overflow segment; HVF would + * accept the duplicate hv_vm_map but software bookkeeping (resolve + * order, destroy/reset ownership) would become ambiguous. + */ + for (int i = 0; i < g->noverflow; i++) { + const guest_overflow_t *o = &g->overflow[i]; + if (gpa < o->ipa_start + o->size && o->ipa_start < end) + return true; + } + return false; +} + +int guest_add_mapping(guest_t *g, + uint64_t gpa, + size_t size, + uint32_t hv_perms, + void **host_va_inout) +{ + if (!g || !host_va_inout || size == 0) + return -1; + if ((gpa & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -1; + /* If the caller supplied a host VA, it must be page-aligned. NULL means + * callee-allocate-and-own; non-NULL means caller-owned, mapped as-is into + * Stage-2 without taking ownership. 
+ */ + if (*host_va_inout && ((uintptr_t) *host_va_inout & (PAGE_SIZE - 1))) + return -1; + if (g->n_mappings >= GUEST_MAX_MAPPINGS) { + log_error("guest_add_mapping: GUEST_MAX_MAPPINGS exhausted"); + return -1; + } + if (guest_mapping_overlaps(g, gpa, size)) { + log_error("guest_add_mapping: range [0x%llx,0x%llx) overlaps existing", + (unsigned long long) gpa, (unsigned long long) (gpa + size)); + return -1; + } + + bool allocated = false; + void *host_va = *host_va_inout; + if (!host_va) { + host_va = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (host_va == MAP_FAILED) { + log_error("guest_add_mapping: mmap %zu bytes failed: %s", size, + strerror(errno)); + return -1; + } + allocated = true; + } + + hv_return_t ret = hv_vm_map(host_va, gpa, size, hv_perms); + if (ret != HV_SUCCESS) { + log_error( + "guest_add_mapping: hv_vm_map gpa=0x%llx size=0x%zx " + "perms=0x%x failed: %d", + (unsigned long long) gpa, size, hv_perms, (int) ret); + if (allocated) + munmap(host_va, size); + return -1; + } + + guest_mapping_t *m = &g->mappings[g->n_mappings++]; + m->gpa = gpa; + m->host_va = host_va; + m->size = size; + m->hv_perms = hv_perms; + m->owns_host = allocated; + + *host_va_inout = host_va; + return 0; +} + +void guest_clear_rosetta_state(guest_t *g) +{ + if (!g) + return; + + release_extra_mappings(g); + + g->is_rosetta = false; + g->rosetta_guest_base = 0; + g->rosetta_va_base = 0; + g->rosetta_size = 0; + g->rosetta_entry = 0; + g->kbuf_gpa = 0; + g->kbuf_base = NULL; + g->ttbr1 = 0; +} + +const guest_mapping_t *guest_find_mapping(const guest_t *g, uint64_t gpa) +{ + if (!g || gpa < g->guest_size) + return NULL; + for (int i = 0; i < g->n_mappings; i++) { + const guest_mapping_t *m = &g->mappings[i]; + if (gpa >= m->gpa && gpa < m->gpa + m->size) + return m; + } + return NULL; +} + +const guest_overflow_t *guest_find_overflow(const guest_t *g, uint64_t gpa) +{ + if (!g || gpa < g->guest_size) + return NULL; + for (int i = 0; i < 
g->noverflow; i++) { + const guest_overflow_t *o = &g->overflow[i]; + if (gpa >= o->ipa_start && gpa < o->ipa_start + o->size) + return o; + } + return NULL; +} + +bool guest_is_valid_range(const guest_t *g, uint64_t gpa, uint64_t len) +{ + if (!g) + return false; + if (len == 0) + return true; + uint64_t end = gpa + len; + if (end < gpa) /* arithmetic overflow */ + return false; + + /* Primary buffer covers [0, guest_size). */ + if (end <= g->guest_size) + return true; + + /* For an extra-region or overflow match the WHOLE range must live inside + * a single region; host pointers cannot safely span discontiguous backing. + * gpa must be the entry point of that lookup so a straddling range + * (primary plus extra, or extra plus extra) is rejected. + */ + if (gpa >= g->guest_size) { + for (int i = 0; i < g->n_mappings; i++) { + const guest_mapping_t *m = &g->mappings[i]; + if (gpa >= m->gpa && end <= m->gpa + m->size) + return true; + } + for (int i = 0; i < g->noverflow; i++) { + const guest_overflow_t *o = &g->overflow[i]; + if (gpa >= o->ipa_start && end <= o->ipa_start + o->size) + return true; + } + } + return false; +} + +uint64_t guest_overflow_alloc(guest_t *g) +{ + if (!g) + return UINT64_MAX; + + /* Reuse bump space in an existing segment first. */ + for (int i = 0; i < g->noverflow; i++) { + guest_overflow_t *o = &g->overflow[i]; + if (o->next + BLOCK_2MIB <= o->size) { + uint64_t ipa = o->ipa_start + o->next; + o->next += BLOCK_2MIB; + return ipa; + } + } + + if (g->noverflow >= GUEST_MAX_OVERFLOW) { + log_error("guest_overflow_alloc: all %d segments exhausted", + GUEST_MAX_OVERFLOW); + return UINT64_MAX; + } + + /* overflow_ipa_next is anchored at guest_size by guest_init; ensure it + * also stays clear of any explicitly-registered extra mapping so the new + * segment does not collide with rosetta or kbuf placements. 
+ */ + uint64_t seg_ipa = g->overflow_ipa_next; + if (seg_ipa < g->guest_size) + seg_ipa = g->guest_size; + uint64_t seg_size = GUEST_OVERFLOW_SIZE; + uint64_t seg_end = seg_ipa + seg_size; + if (seg_end < seg_ipa) { + log_error("guest_overflow_alloc: IPA overflow at 0x%llx", + (unsigned long long) seg_ipa); + return UINT64_MAX; + } + /* Skip past every extra mapping that overlaps the candidate placement, + * rounding the new base up to 2 MiB: guest_add_mapping only enforces + * PAGE_SIZE alignment, but this segment is handed out as 2 MiB blocks. + * Each skip moves seg_ipa forward, so the scan restarts; every skip + * clears at least one mapping, so termination is guaranteed. + */ + bool moved; + do { + moved = false; + for (int i = 0; i < g->n_mappings; i++) { + const guest_mapping_t *m = &g->mappings[i]; + uint64_t m_end = m->gpa + m->size; + if (seg_ipa < m_end && m->gpa < seg_end) { + seg_ipa = (m_end + BLOCK_2MIB - 1) & ~((uint64_t) BLOCK_2MIB - 1); + seg_end = seg_ipa + seg_size; + if (seg_end < seg_ipa) { + log_error("guest_overflow_alloc: IPA overflow after skip"); + return UINT64_MAX; + } + moved = true; + break; + } + } + } while (moved); + + void *host = mmap(NULL, seg_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (host == MAP_FAILED) { + log_error("guest_overflow_alloc: mmap %llu MiB failed: %s", + (unsigned long long) (seg_size >> 20), strerror(errno)); + return UINT64_MAX; + } + hv_return_t ret = + hv_vm_map(host, seg_ipa, seg_size, + HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); + if (ret != HV_SUCCESS) { + log_error("guest_overflow_alloc: hv_vm_map IPA 0x%llx failed: %d", + (unsigned long long) seg_ipa, (int) ret); + munmap(host, seg_size); + return UINT64_MAX; + } + + int idx = g->noverflow++; + g->overflow[idx].host_base = host; + g->overflow[idx].ipa_start = seg_ipa; + g->overflow[idx].size = seg_size; + g->overflow[idx].next = BLOCK_2MIB; /* first 2 MiB block consumed */ + g->overflow_ipa_next = seg_end; + return seg_ipa; +} + +/* Write 128 x 2 MiB kbuf 
block descriptors into the supplied L2 page table + * starting at slot 384 (the first slot covering KBUF_VA_BASE / KBUF_USER_VA). + * Each descriptor maps the next 2 MiB of [kbuf_gpa, kbuf_gpa + KBUF_SIZE) + * with RW + UXN + PXN: the kbuf must stay data-only under both the kernel + * TTBR1 mirror and the user-VA TTBR0 alias to preserve the aliasing-proof + * W^X invariant. + */ +static void populate_kbuf_l2_blocks(uint64_t *l2, uint64_t kbuf_gpa) +{ + for (int i = 384; i < 512; i++) { + uint64_t ipa = kbuf_gpa + (uint64_t) (i - 384) * BLOCK_2MIB; + l2[i] = ipa | PT_AF | PT_SH_ISH | PT_ATTR1 | PT_AP_RW_EL0 | PT_UXN | + PT_PXN | PT_BLOCK; + } +} + +int guest_init_kbuf(guest_t *g, uint64_t kbuf_gpa) +{ + if (!g) + return -1; + /* Scrub kbuf state up-front so partial failure leaves the guest in a + * fully-zeroed state rather than a stale half-initialized one. The + * caller must treat a -1 return as "kbuf is unconfigured" and must not + * read g->ttbr1 / g->kbuf_base / g->kbuf_gpa. + */ + g->ttbr1 = 0; + g->kbuf_gpa = 0; + g->kbuf_base = NULL; + + if (kbuf_gpa & (BLOCK_2MIB - 1)) { + log_error("guest_init_kbuf: kbuf_gpa 0x%llx not 2 MiB-aligned", + (unsigned long long) kbuf_gpa); + return -1; + } + if (kbuf_gpa + KBUF_SIZE > g->guest_size || + kbuf_gpa + KBUF_SIZE < kbuf_gpa) { + log_error( + "guest_init_kbuf: [0x%llx,+%llu MiB) exceeds primary buffer " + "0x%llx", + (unsigned long long) kbuf_gpa, + (unsigned long long) (KBUF_SIZE >> 20), + (unsigned long long) g->guest_size); + return -1; + } + + /* kbuf lives inside the primary buffer; the existing Stage-2 mapping at + * IPA 0 already covers the GPA range, so no extra hv_vm_map is needed. + * macOS demand-pages the host buffer, so untouched 256 MiB cost nothing. + * kbuf_gpa and kbuf_base are published after the PT pool allocation + * succeeds below. + */ + + /* Build TTBR1 page-table tree: L0[511] -> L1 -> L1[511] -> L2. 
+ * L2 entries 384..511 cover [KBUF_VA_BASE, KBUF_VA_BASE+KBUF_SIZE) with + * 128 x 2 MiB block descriptors. RW + UXN + PXN: kbuf is data-only, so + * no executable alias can exist via TTBR1. + */ + uint64_t l0_gpa = pt_alloc_page(g); + uint64_t l1_gpa = pt_alloc_page(g); + uint64_t l2_gpa = pt_alloc_page(g); + if (!l0_gpa || !l1_gpa || !l2_gpa) { + log_error("guest_init_kbuf: page-table allocation failed"); + /* Up-front scrub already cleared kbuf_base / kbuf_gpa / ttbr1. */ + return -1; + } + /* From this point the kbuf is real; publish the host pointer + GPA so + * subsequent code can look them up. ttbr1 is published last, after the + * L0/L1/L2 tree is fully populated below. + */ + g->kbuf_gpa = kbuf_gpa; + g->kbuf_base = (uint8_t *) g->host_base + kbuf_gpa; + + uint64_t *l0 = pt_at(g, l0_gpa); + l0[511] = (g->ipa_base + l1_gpa) | PT_VALID | PT_TABLE; + + uint64_t *l1 = pt_at(g, l1_gpa); + l1[511] = (g->ipa_base + l2_gpa) | PT_VALID | PT_TABLE; + + uint64_t *l2 = pt_at(g, l2_gpa); + populate_kbuf_l2_blocks(l2, kbuf_gpa); + + g->ttbr1 = g->ipa_base + l0_gpa; + return 0; +} + +/* Forward declarations for helpers defined later in the file. 
*/ +static uint64_t make_block_desc(uint64_t gpa, int perms); + +int guest_map_va_range(guest_t *g, + uint64_t va_start, + uint64_t va_end, + uint64_t gpa_start, + int perms) +{ + if (!g || va_end <= va_start) + return -1; + if ((va_start | va_end | gpa_start) & (BLOCK_2MIB - 1)) { + log_error( + "guest_map_va_range: arguments not 2 MiB aligned " + "(va=[0x%llx,0x%llx) gpa=0x%llx)", + (unsigned long long) va_start, (unsigned long long) va_end, + (unsigned long long) gpa_start); + return -1; + } + if (!g->ttbr0) + return -1; + + uint64_t base = g->ipa_base; + uint64_t *l0 = pt_at(g, g->ttbr0 - base); + if (!l0) + return -1; + + uint64_t cur_gpa = gpa_start; + for (uint64_t va = va_start; va < va_end; + va += BLOCK_2MIB, cur_gpa += BLOCK_2MIB) { + unsigned l0_idx = (unsigned) (va / (512ULL * BLOCK_1GIB)); + if (l0_idx >= 512) { + log_error("guest_map_va_range: VA 0x%llx out of L0 range", + (unsigned long long) va); + return -1; + } + if (!(l0[l0_idx] & PT_VALID)) { + uint64_t l1_gpa = pt_alloc_page(g); + if (!l1_gpa) + return -1; + l0[l0_idx] = (base + l1_gpa) | PT_VALID | PT_TABLE; + } + uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; + uint64_t *l1 = pt_at(g, l1_ipa - base); + if (!l1) + return -1; + + unsigned l1_idx = + (unsigned) ((va % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); + if (!(l1[l1_idx] & PT_VALID)) { + uint64_t l2_gpa = pt_alloc_page(g); + if (!l2_gpa) + return -1; + l1[l1_idx] = (base + l2_gpa) | PT_VALID | PT_TABLE; + } else if (!(l1[l1_idx] & PT_TABLE)) { + log_error( + "guest_map_va_range: L1[%u] is a block, not a table; " + "VA 0x%llx collides with an existing mapping", + l1_idx, (unsigned long long) va); + return -1; + } + uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; + uint64_t *l2 = pt_at(g, l2_ipa - base); + if (!l2) + return -1; + + unsigned l2_idx = (unsigned) ((va % BLOCK_1GIB) / BLOCK_2MIB); + if (l2[l2_idx] & PT_VALID) { + /* Block already mapped -- caller may want guest_update_perms / + * guest_split_block instead. 
Skip silently to mirror upstream's + * sys_mmap_high_va "reuse existing GPA" behaviour. + */ + continue; + } + l2[l2_idx] = make_block_desc(cur_gpa, perms); + } + + /* The new entries are visible to the host immediately; the shim flushes + * the matching TLBs on syscall return via the per-vCPU accumulator. + */ + tlbi_request_range(va_start, va_end); + guest_pt_gen_bump(g); + return 0; +} + +int guest_install_kbuf_user_alias(guest_t *g) +{ + if (!g || !g->kbuf_gpa || !g->ttbr0) { + log_error( + "guest_install_kbuf_user_alias: kbuf or ttbr0 not " + "initialised"); + return -1; + } + + /* Walk to the L0/L1/L2 slot for KBUF_USER_VA. VA bits 47:0 decompose as + * bits 47:39 = 511 (L0 idx) + * bits 38:30 = 511 (L1 idx) + * bits 29:21 = 384..511 (L2 idx, 128 entries covering 256 MiB) + * L1/L2 pages are allocated from the PT pool if they are not present. + */ + uint64_t base = g->ipa_base; + uint64_t *l0 = pt_at(g, g->ttbr0 - base); + if (!l0) + return -1; + + if (!(l0[511] & PT_VALID)) { + uint64_t l1_gpa = pt_alloc_page(g); + if (!l1_gpa) + return -1; + l0[511] = (base + l1_gpa) | PT_VALID | PT_TABLE; + } + uint64_t l1_ipa = l0[511] & 0xFFFFFFFFF000ULL; + uint64_t *l1 = pt_at(g, l1_ipa - base); + if (!l1) + return -1; + + if (!(l1[511] & PT_VALID)) { + uint64_t l2_gpa = pt_alloc_page(g); + if (!l2_gpa) + return -1; + l1[511] = (base + l2_gpa) | PT_VALID | PT_TABLE; + } else if (!(l1[511] & PT_TABLE)) { + log_error( + "guest_install_kbuf_user_alias: L1[511] is a block, not a " + "table; cannot install kbuf alias"); + return -1; + } + uint64_t l2_ipa = l1[511] & 0xFFFFFFFFF000ULL; + uint64_t *l2 = pt_at(g, l2_ipa - base); + if (!l2) + return -1; + + /* Reject overlap before any descriptor is written so a collision leaves + * the existing mapping intact. 
+ */ + for (int i = 384; i < 512; i++) { + if (l2[i] & PT_VALID) { + log_error( + "guest_install_kbuf_user_alias: L2[%d] already populated " + "(0x%llx); kbuf user-VA range collides with another " + "mapping", + i, (unsigned long long) l2[i]); + return -1; + } + } + populate_kbuf_l2_blocks(l2, g->kbuf_gpa); + + guest_pt_gen_bump(g); + return 0; +} + typedef struct { uint64_t gpa, chunk; } gva_translation_t; @@ -597,7 +1112,12 @@ static int gva_translate_perm(const guest_t *g, if (page_ipa < base) return -1; uint64_t gpa = (page_ipa - base) + (gva & (PAGE_SIZE - 1)); - if (gpa >= g->guest_size) + /* Accept GPAs inside the primary buffer or covered by an extra IPA + * mapping (rosetta segments, kbuf, etc.). Anything else is a + * dangling page-table entry pointing at unmapped Stage-2 IPA. + */ + if (gpa >= g->guest_size && !guest_find_mapping(g, gpa) && + !guest_find_overflow(g, gpa)) return -1; out->gpa = gpa; @@ -622,7 +1142,8 @@ static int gva_translate_perm(const guest_t *g, if (block_ipa < base) return -1; uint64_t gpa = (block_ipa - base) + (gva & (BLOCK_2MIB - 1)); - if (gpa >= g->guest_size) + if (gpa >= g->guest_size && !guest_find_mapping(g, gpa) && + !guest_find_overflow(g, gpa)) return -1; out->gpa = gpa; @@ -653,8 +1174,27 @@ static uint64_t gva_contiguous_avail(const guest_t *g, for (;;) { uint64_t chunk = cur.chunk; - if (chunk > g->guest_size - cur.gpa) - chunk = g->guest_size - cur.gpa; + /* Clamp to the remaining bytes in whichever backing region cur.gpa + * lives in: the primary buffer, or an extra IPA mapping. The original + * primary-buffer clamp underflowed harmlessly for high GPAs, but the + * explicit mapping lookup keeps the semantics correct. 
+ */ + uint64_t region_end; + if (cur.gpa < g->guest_size) { + region_end = g->guest_size; + } else { + const guest_mapping_t *m = guest_find_mapping(g, cur.gpa); + if (m) { + region_end = m->gpa + m->size; + } else { + const guest_overflow_t *o = guest_find_overflow(g, cur.gpa); + if (!o) + break; + region_end = o->ipa_start + o->size; + } + } + if (chunk > region_end - cur.gpa) + chunk = region_end - cur.gpa; if (chunk > limit - total) chunk = limit - total; @@ -707,7 +1247,34 @@ static void *gva_resolve_perm(const guest_t *g, *avail = gva_contiguous_avail(g, gva, required_perms, &first, avail_limit); } - return (uint8_t *) g->host_base + first.gpa; + if (first.gpa < g->guest_size) + return (uint8_t *) g->host_base + first.gpa; + + /* GPA outside the primary buffer: consult the extra IPA mappings (rosetta + * segments, kbuf) first, then the overflow segments (lazy 1 GiB bump + * allocator for high-VA 2 MiB blocks). gva_contiguous_avail naturally + * stops at GPA-discontinuity boundaries between regions, so a single + * region match suffices for the host pointer translation. + */ + const guest_mapping_t *m = guest_find_mapping(g, first.gpa); + if (m) { + if (avail) { + uint64_t cap = (m->gpa + m->size) - first.gpa; + if (*avail > cap) + *avail = cap; + } + return (uint8_t *) m->host_va + (first.gpa - m->gpa); + } + const guest_overflow_t *o = guest_find_overflow(g, first.gpa); + if (o) { + if (avail) { + uint64_t cap = (o->ipa_start + o->size) - first.gpa; + if (*avail > cap) + *avail = cap; + } + return (uint8_t *) o->host_base + (first.gpa - o->ipa_start); + } + return NULL; } void *guest_ptr(const guest_t *g, uint64_t gva) @@ -871,14 +1438,22 @@ void guest_reset(guest_t *g) /* Zero tracked regions (ELF segments, heap, stack, mmap allocations). * Skip PROT_NONE regions because they were never touched. - * Skip regions with GPAs beyond the primary buffer. 
+ * + * Scrub by backing GPA, not by VA: identity-mapped regions have + * gpa_base == start, but high-VA regions (rosetta) carry their VA in + * start/end and the real primary-buffer offset in gpa_base. Filtering + * by end <= guest_size alone would silently skip high-VA backing and + * leak bytes across a rosetta-to-rosetta execve. */ for (int i = 0; i < g->nregions; i++) { guest_region_t *r = &g->regions[i]; - if (r->prot != 0 /* PROT_NONE */ && r->end > r->start && - r->end <= g->guest_size) { - memset((uint8_t *) g->host_base + r->start, 0, r->end - r->start); - } + if (r->prot == 0 /* PROT_NONE */ || r->end <= r->start) + continue; + uint64_t len = r->end - r->start; + uint64_t gpa = r->gpa_base; + if (gpa > g->guest_size || len > g->guest_size - gpa) + continue; /* backing lies outside the primary slab */ + memset((uint8_t *) g->host_base + gpa, 0, len); } /* Zero page table pool (not tracked in region array) */ @@ -892,6 +1467,15 @@ void guest_reset(guest_t *g) memset((uint8_t *) g->host_base + g->shim_base, 0, g->shim_data_base + BLOCK_2MIB - g->shim_base); + /* Release overflow segments. The page tables that referenced them are + * about to be rebuilt by the exec path, so the GPA space is no longer + * needed. New segments will be allocated lazily when the next binary + * exercises the high-VA path. Rosetta placement (g->mappings[]) is + * intentionally NOT touched: it survives execve so that re-execing + * another x86_64 binary keeps the same rosetta image in place. + */ + release_overflow_segments(g, g->guest_size); + /* Reset allocation state */ guest_pt_gen_bump(g); guest_tlb_flush(); @@ -980,6 +1564,25 @@ int guest_get_used_regions(const guest_t *g, n++; } + /* Rosetta image and TTBR1 kbuf live near the top of the primary buffer + * (between the mmap RW high-water mark and the infra reserve), so the + * MMAP_BASE..mmap_next range above misses them. 
Snapshot each as its + * own block so the fork child inherits the translator image and the + * kernel-VA scratchpad without rebuilding either. + */ + if (g->is_rosetta) { + if (n < max && g->rosetta_guest_base && g->rosetta_size) { + out[n].offset = g->rosetta_guest_base; + out[n].size = g->rosetta_size; + n++; + } + if (n < max && g->kbuf_gpa) { + out[n].offset = g->kbuf_gpa; + out[n].size = KBUF_SIZE; + n++; + } + } + return n; } @@ -997,6 +1600,8 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b) { if (a->end != b->start) return false; + if (a->gpa_base + (a->end - a->start) != b->gpa_base) + return false; if (a->prot != b->prot) return false; if (a->flags != b->flags) @@ -1086,8 +1691,29 @@ int guest_region_add_ex(guest_t *g, return -1; } - return guest_region_add_ex_owned(g, start, end, prot, flags, offset, name, - owned_backing_fd); + return guest_region_add_ex_owned_gpa(g, start, end, start, prot, flags, + offset, name, owned_backing_fd); +} + +int guest_region_add_ex_gpa(guest_t *g, + uint64_t start, + uint64_t end, + uint64_t gpa_base, + int prot, + int flags, + uint64_t offset, + const char *name, + int backing_fd) +{ + int owned_backing_fd = -1; + if (backing_fd >= 0) { + owned_backing_fd = dup(backing_fd); + if (owned_backing_fd < 0) + return -1; + } + + return guest_region_add_ex_owned_gpa(g, start, end, gpa_base, prot, flags, + offset, name, owned_backing_fd); } int guest_region_add_ex_owned(guest_t *g, @@ -1098,6 +1724,20 @@ int guest_region_add_ex_owned(guest_t *g, uint64_t offset, const char *name, int owned_backing_fd) +{ + return guest_region_add_ex_owned_gpa(g, start, end, start, prot, flags, + offset, name, owned_backing_fd); +} + +int guest_region_add_ex_owned_gpa(guest_t *g, + uint64_t start, + uint64_t end, + uint64_t gpa_base, + int prot, + int flags, + uint64_t offset, + const char *name, + int owned_backing_fd) { if (g->nregions >= GUEST_MAX_REGIONS) { log_error( @@ -1120,6 +1760,7 @@ int 
guest_region_add_ex_owned(guest_t *g, guest_region_t *r = &g->regions[i]; r->start = start; r->end = end; + r->gpa_base = gpa_base; r->prot = prot; r->flags = flags; r->offset = offset; @@ -1174,6 +1815,7 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) if (r->start >= start && r->end > end) { uint64_t trimmed = end - r->start; r->offset += trimmed; + r->gpa_base += trimmed; r->start = end; guest_region_clip_overlay(r); i++; @@ -1214,6 +1856,7 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) guest_region_t *right = &g->regions[i + 1]; *right = *r; /* Copy attributes */ right->offset += (end - r->start); + right->gpa_base += (end - r->start); right->start = end; if (r->backing_fd >= 0) { /* A dup failure leaves backing_fd=-1, silently converting this @@ -1293,6 +1936,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) guest_region_clip_overlay(&g->regions[i]); /* Right half will be processed next iteration */ g->regions[i + 1].offset += (start - g->regions[i + 1].start); + g->regions[i + 1].gpa_base += (start - g->regions[i + 1].start); g->regions[i + 1].start = start; if (g->regions[i + 1].backing_fd >= 0) { g->regions[i + 1].backing_fd = @@ -1335,6 +1979,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) guest_region_clip_overlay(&g->regions[i]); /* Right half: [end, old_end) keeps original prot */ g->regions[i + 1].offset += (end - g->regions[i + 1].start); + g->regions[i + 1].gpa_base += (end - g->regions[i + 1].start); g->regions[i + 1].start = end; if (g->regions[i + 1].backing_fd >= 0) { g->regions[i + 1].backing_fd = @@ -1437,8 +2082,18 @@ static bool finalize_block_perms(guest_t *g, const mem_region_t *regions, int n) * guest_update_perms produce the same final L3 state on every pass), so * dedup is an optimization the heap-region scale (~127 blocks for the * default brk window) does not justify against a fixed-size visited set. 
+ * + * Non-identity (va_base != 0) rosetta regions are skipped: the maintenance + * helpers below (guest_split_block, guest_update_perms) navigate by GPA + * but rosetta's L2 entries live at high-VA indices, so a GPA-keyed walk + * lands in the wrong slot. Rosetta uses a single full-coverage RWX block + * descriptor by design, so no L3 splitting is needed; if a later workload + * requires per-segment perms inside the rosetta image, the maintenance + * helpers must learn va_base first. */ for (int r = 0; r < n; r++) { + if (regions[r].va_base != 0) + continue; uint64_t r_block_lo = ALIGN_2MIB_DOWN(regions[r].gpa_start); uint64_t r_block_hi = ALIGN_2MIB_UP(regions[r].gpa_end); @@ -1453,6 +2108,14 @@ static bool finalize_block_perms(guest_t *g, const mem_region_t *regions, int n) bool same_perm = true; for (int s = 0; s < n; s++) { + /* Non-identity regions are excluded from the coverage sweep + * for the same reason as the outer skip: their L2 entries + * live at a different VA-index than their GPA suggests, so + * mixing them into a GPA-keyed split would corrupt either + * tree. + */ + if (regions[s].va_base != 0) + continue; if (regions[s].gpa_end <= b || regions[s].gpa_start >= b + BLOCK_2MIB) continue; @@ -1569,9 +2232,19 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start); uint64_t gpa_end = ALIGN_2MIB_UP(regions[r].gpa_end); int perms = regions[r].perms; + /* Non-identity regions (rosetta segments) supply a va_base so the + * page-table entry is indexed by VA, but the block descriptor still + * carries the GPA where the data physically lives. va_offset is the + * constant delta between VA and GPA inside the region; 0 for the + * identity case which keeps the math identical to the original. 
+ */ + uint64_t va_offset = 0; + if (regions[r].va_base) + va_offset = regions[r].va_base - regions[r].gpa_start; for (uint64_t gpa = gpa_start; gpa < gpa_end; gpa += BLOCK_2MIB) { - uint64_t lookup_addr = base + gpa; + uint64_t output_ipa = base + gpa; + uint64_t lookup_addr = output_ipa + va_offset; /* L0 index: which 512GiB slot this VA falls in */ unsigned l0_idx = (unsigned) (lookup_addr / (512ULL * BLOCK_1GIB)); @@ -1633,8 +2306,12 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) block_perms |= old_perms; } - /* Block descriptor: output IPA (where data physically lives) */ - l2[l2_idx] = make_block_desc(lookup_addr, block_perms); + /* Block descriptor: output IPA (where data physically lives). + * For identity regions output_ipa == lookup_addr; for non-identity + * (rosetta) the entry sits at the high VA but the descriptor + * points to the primary-buffer GPA where the bytes actually are. + */ + l2[l2_idx] = make_block_desc(output_ipa, block_perms); } } @@ -1663,6 +2340,25 @@ int guest_extend_page_tables(guest_t *g, uint64_t end, int perms) { + /* Identity-only by construction: the L2 block descriptor's output IPA + * is identical to the VA index, so the new mapping puts data at the + * same GPA as the VA. That assumption breaks for non-identity rosetta + * ranges, where data lives at a low GPA below interp_base while the + * VA sits at 128 TiB. Such ranges already have entries installed by + * guest_map_va_range during rosetta_prepare; an extension request at + * a non-identity VA would silently fabricate a dangling block + * descriptor. Refuse cleanly so the misuse surfaces in logs rather + * than in a post-mortem stage-2 fault. 
+ */ + if (start >= g->guest_size || end > g->guest_size) { + log_error( + "guest_extend_page_tables: [0x%llx,0x%llx) is outside the " + "primary buffer; non-identity ranges must use " + "guest_map_va_range", + (unsigned long long) start, (unsigned long long) end); + return -1; + } + uint64_t base = g->ipa_base; /* Navigate to L0 table */ @@ -1732,6 +2428,38 @@ int guest_extend_page_tables(guest_t *g, return 0; } +bool guest_va_block_mapped(const guest_t *g, uint64_t va) +{ + if (!g || !g->ttbr0 || (va & (BLOCK_2MIB - 1))) + return false; + + uint64_t base = g->ipa_base; + uint64_t *l0 = pt_at(g, g->ttbr0 - base); + if (!l0) + return false; + + unsigned l0_idx = (unsigned) (va / (512ULL * BLOCK_1GIB)); + if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) + return false; + + uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; + uint64_t *l1 = pt_at(g, l1_ipa - base); + if (!l1) + return false; + + unsigned l1_idx = (unsigned) ((va % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); + if (!(l1[l1_idx] & PT_VALID) || !(l1[l1_idx] & PT_TABLE)) + return false; + + uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; + uint64_t *l2 = pt_at(g, l2_ipa - base); + if (!l2) + return false; + + unsigned l2_idx = (unsigned) ((va % BLOCK_1GIB) / BLOCK_2MIB); + return (l2[l2_idx] & PT_VALID) != 0; +} + /* L3 page table splitting. */ /* L3 page descriptor: bits[1:0]=11 = valid page at level 3. @@ -1770,17 +2498,28 @@ static int desc_to_perms(uint64_t desc) return perms; } -/* Navigate L0->L1->L2 to find the L2 entry for a given GPA offset. - * Returns a pointer to the L2 entry, or NULL if not mapped. +/* Locate the L2 descriptor that covers a 2 MiB block at the given guest + * virtual address. The walk is VA-driven (L0/L1/L2 indices come from bits + * 47:21 of va), so it locates the correct entry for non-identity rosetta + * regions too as long as the caller passes the guest VA rather than the + * data's backing GPA. 
Callers iterating regions[] by gpa_start MUST + * translate to the region's va_base when va_base != 0 before invoking + * this helper, or skip the region entirely (see finalize_block_perms for + * the prevailing pattern). + * + * Returns NULL if the VA falls outside the L0 range or no entry has been + * installed along the L0 to L1 to L2 chain. */ -static uint64_t *find_l2_entry(guest_t *g, uint64_t gpa_offset) +static uint64_t *find_l2_entry(guest_t *g, uint64_t va) { - uint64_t base = g->ipa_base, ipa = base + gpa_offset; + uint64_t base = g->ipa_base, ipa = base + va; uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* L0 index from actual IPA (not base), correct for >512GiB */ + /* L0 index from the full VA (not just the IPA-offset slice); correct + * for entries above the primary buffer (rosetta at 128 TiB, etc.). + */ unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) return NULL; @@ -1920,6 +2659,22 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) start = start & ~(PAGE_SIZE - 1); end = (end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + /* Aliasing-proof invariant: TTBR1 maps the kbuf RW + UXN + PXN. The same + * physical pages will be dual-mapped at KBUF_USER_VA under TTBR0 by the + * rosetta finalize path. An executable TTBR0 alias would defeat HVF's + * per-mapping W^X enforcement and create a writable-and-executable race + * against the kernel-VA mirror. Reject any attempt to grant MEM_PERM_X + * inside the user-VA kbuf mirror window before page tables are touched. 
+ */ + if ((perms & MEM_PERM_X) && end > start && + guest_kbuf_user_va_overlap(start, end - start)) { + log_error( + "guest_update_perms: refusing executable kbuf alias " + "[0x%llx, 0x%llx); violates W^X aliasing invariant", + (unsigned long long) start, (unsigned long long) end); + return -1; + } + for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); if (!l2_entry) { @@ -1986,20 +2741,27 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) unsigned l3_idx = (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); /* Extract the existing output IPA from the L3 entry. For - * non-identity mapped regions, pa is a VA not a GPA, so the builder - * must use the IPA already stored in the descriptor (set by + * non-identity mapped regions, pa is a VA not a GPA, so the + * builder must use the IPA already stored in the descriptor (set by * guest_split_block). * * For invalidated entries (set to 0 by guest_invalidate_ptes), the - * stored IPA is 0, which is wrong. Fall back to computing the - * identity-mapped IPA (base + pa). This is correct for TTBR0 - * user-space regions where VA == IPA == GPA. + * stored IPA is gone. Recover it from region metadata when the VA + * range is non-identity mapped; otherwise fall back to the usual + * identity IPA (base + pa). 
*/ uint64_t page_ipa; - if (l3[l3_idx] & PT_VALID) + if (l3[l3_idx] & PT_VALID) { page_ipa = l3[l3_idx] & 0xFFFFFFFFF000ULL; - else - page_ipa = base + (pa & ~(PAGE_SIZE - 1)); + } else { + const guest_region_t *r = guest_region_find(g, pa); + if (r) { + uint64_t page_gpa = r->gpa_base + (pa - r->start); + page_ipa = base + (page_gpa & ~(PAGE_SIZE - 1)); + } else { + page_ipa = base + (pa & ~(PAGE_SIZE - 1)); + } + } uint64_t new_desc = make_page_desc(page_ipa, perms); if (l3[l3_idx] != new_desc) { l3[l3_idx] = new_desc; @@ -2019,6 +2781,71 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) return 0; } +int guest_install_va_pages(guest_t *g, + uint64_t va, + uint64_t length, + uint64_t gpa, + int perms) +{ + if (!g || length == 0) + return -1; + if ((va | length | gpa) & (PAGE_SIZE - 1)) + return -1; + /* Reject wrap on both the VA side and the GPA side. Without the gpa + * guard, a caller could pass a near-UINT64_MAX gpa with a non-zero + * length and the loop would wrap p back to a low GPA, silently + * installing descriptors pointing at the wrong physical pages. + */ + if (va > UINT64_MAX - length || gpa > UINT64_MAX - length) + return -1; + + /* Aliasing-proof invariant: TTBR1 maps the kbuf RW + UXN + PXN, and the + * same physical pages are mirrored at KBUF_USER_VA under TTBR0. An + * executable alias inside the user-VA kbuf window would defeat HVF's + * per-mapping W^X enforcement (the kernel-VA mirror is writable). Match + * the equivalent check in guest_update_perms before touching pages. + */ + if ((perms & MEM_PERM_X) && guest_kbuf_user_va_overlap(va, length)) { + log_error( + "guest_install_va_pages: refusing executable kbuf alias " + "[0x%llx, 0x%llx); violates W^X aliasing invariant", + (unsigned long long) va, (unsigned long long) (va + length)); + return -1; + } + + uint64_t base = g->ipa_base; + uint64_t end = va + length; + + /* Walk one 4 KiB page at a time. 
find_l2_entry locates the L2 slot for + * each VA; split_l2_block converts an L2 block descriptor into a table + * lazily so individual L3 entries can be written. The L3 entry is then + * unconditionally overwritten with the requested gpa + perms, so a prior + * invalidation (or a fresh split inheriting the wrong block address) + * cannot leave behind a stale or zero descriptor. + */ + for (uint64_t v = va, p = gpa; v < end; v += PAGE_SIZE, p += PAGE_SIZE) { + uint64_t *l2_entry = find_l2_entry(g, v); + if (!l2_entry) + return -1; + if (!(*l2_entry & PT_VALID)) + return -1; + if ((*l2_entry & 3) == 1) { + if (guest_split_block(g, ALIGN_2MIB_DOWN(v)) < 0) + return -1; + } + uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; + uint64_t *l3 = pt_at(g, l3_ipa - base); + if (!l3) + return -1; + unsigned l3_idx = (unsigned) (((base + v) % BLOCK_2MIB) / PAGE_SIZE); + l3[l3_idx] = make_page_desc(base + p, perms); + } + + tlbi_request_range(va, end); + guest_pt_gen_bump(g); + return 0; +} + /* Lazy page materialization for MAP_NORESERVE. */ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset) diff --git a/src/core/guest.h b/src/core/guest.h index 350267c..2dda869 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -98,6 +98,30 @@ */ #define GUEST_IPA_BASE 0x0ULL +/* Kernel-VA window for x86_64-via-Rosetta guests. + * + * Rosetta issues MAP_FIXED at bits-63-set addresses (0xFFFFFFFFF0000000+) for + * its internal kernel-VA allocations. With EPD1=1 (default TCR), TTBR1 walks + * are disabled and such VAs fault. The kbuf window enables TTBR1, backs a + * 256 MiB region in the primary buffer at kbuf_gpa, and installs an L0[511]/ + * L1[511]/L2[384..511] page-table tree. + * + * KBUF_USER_VA is the bits-47:0 alias used by rosetta's TaggedPointer + * extraction (which strips bits 63:48). Mapping the SAME physical kbuf pages + * at both KBUF_VA_BASE under TTBR1 and KBUF_USER_VA under TTBR0 lets a single + * physical region service both views. 
+ * + * Aliasing-proof invariant: the kbuf is RW under both mappings; nothing + * executable is ever installed inside [kbuf_gpa, kbuf_gpa + KBUF_SIZE). The + * aliased VAs are leaf data only, so HVF's per-mapping W^X enforcement cannot + * create a writable-and-executable race. Future kbuf writers must keep the + * pages RW-only; an executable kbuf alias would violate the invariant. + */ +#define KBUF_VA_BASE \ + 0xFFFFFFFFF0000000ULL /* TTBR1 kernel-VA base (last 256 MiB) */ +#define KBUF_SIZE 0x10000000ULL /* 256 MiB */ +#define KBUF_USER_VA (KBUF_VA_BASE & 0x0000FFFFFFFFFFFFULL) /* TTBR0 mirror */ + /* Page table attributes. */ /* Memory region permission flags */ #define MEM_PERM_R (1 << 0) @@ -107,12 +131,23 @@ #define MEM_PERM_RW (MEM_PERM_R | MEM_PERM_W) /* A contiguous region of guest memory to be mapped in page tables. - * Identity-mapped: VA == GPA. + * + * Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every + * boot region (shim, vDSO, brk, stack) and every aarch64 ELF segment. + * + * Rosetta segments use va_base != 0 to install a non-identity mapping: + * the rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its + * bytes live in the primary buffer at a low GPA. Page-table entries are + * indexed by va_base + (offset within region) and emit a block descriptor + * whose output address is gpa_start + (offset within region). This is the + * only place in elfuse where guest VA diverges from guest GPA. */ typedef struct { - uint64_t gpa_start; /* Output IPA/GPA (2MiB aligned) */ - uint64_t gpa_end; /* Output IPA/GPA end (exclusive, 2MiB aligned) */ - int perms; /* MEM_PERM_* flags */ + uint64_t gpa_start; /* Output GPA / IPA (2MiB aligned) */ + uint64_t gpa_end; /* Output GPA / IPA end (exclusive, 2MiB aligned) */ + uint64_t + va_base; /* 0 for identity, else the guest VA the region appears at */ + int perms; /* MEM_PERM_* flags */ } mem_region_t; /* Semantic region tracking. 
*/ @@ -151,14 +186,18 @@ typedef struct { * Regions are kept sorted by start address in guest_t.regions[]. */ typedef struct { - uint64_t start; /* GPA start for gap-finder (page-aligned) */ - uint64_t end; /* GPA end (exclusive, page-aligned) */ - int prot; /* LINUX_PROT_* flags */ - int flags; /* LINUX_MAP_* flags (for /proc/self/maps display) */ - uint64_t offset; /* File offset (for /proc/self/maps display) */ - int backing_fd; /* Duplicated host fd for file-backed mappings, or -1 */ - bool shared; /* MAP_SHARED (writes should propagate) */ - bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */ + uint64_t start; /* GPA start for gap-finder (page-aligned) */ + uint64_t end; /* GPA end (exclusive, page-aligned) */ + uint64_t gpa_base; /* Backing GPA corresponding to start. Equals start for + * identity-mapped regions; differs for high-VA guest + * mappings whose VA and GPA diverge. + */ + int prot; /* LINUX_PROT_* flags */ + int flags; /* LINUX_MAP_* flags (for /proc/self/maps display) */ + uint64_t offset; /* File offset (for /proc/self/maps display) */ + int backing_fd; /* Duplicated host fd for file-backed mappings, or -1 */ + bool shared; /* MAP_SHARED (writes should propagate) */ + bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */ bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay * of backing_fd at host_base+start. The kernel's page * cache keeps it coherent with the file and with peer @@ -207,6 +246,51 @@ typedef struct { uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */ } tlbi_request_t; +/* Multi-region IPA mapping. + * + * The primary buffer is identity-mapped at IPA 0 and covers the low IPA range + * (typically 64 GiB on M1, 1 TiB on M3+). Anything that lives above that -- + * notably rosetta's statically-linked segments at 128 TiB -- needs its own + * Stage-2 mapping installed via a separate hv_vm_map. 
Each such region is + * recorded in guest_t.mappings[] so guest_ptr / gva_resolve can translate + * page-table-walk results that land outside the primary buffer. + * + * macOS user-space cannot directly mmap at host VA 128 TiB, so the host VA + * is unrelated to the guest IPA. The mapping records both. + */ +#define GUEST_MAX_MAPPINGS 8 +typedef struct { + uint64_t gpa; /* IPA where the mapping is installed (Stage-2 base) */ + void *host_va; /* Host virtual address backing the IPA range */ + size_t size; /* Bytes covered (always page-aligned) */ + uint32_t hv_perms; /* HV_MEMORY_READ/WRITE/EXEC bitmask used at map time */ + bool owns_host; /* True if host_va was allocated by guest_add_mapping */ +} guest_mapping_t; + +/* Overflow segment for incremental GPA expansion. + * + * The primary buffer's mmap pool is large by aarch64 standards (56 GiB on M1, + * 1016 GiB on M3+) but rosetta JIT/PIE/slab traffic at 85 TB / 240 TB issues + * many 2 MiB blocks that consume the pool quickly on hosts where Stage-2 caps + * the primary buffer at 36-bit IPA. Overflow segments are 1 GiB host buffers + * mapped at IPAs stacked just above guest_size; a bump allocator hands out + * 2 MiB blocks. New segments are created lazily so untouched overflow costs + * nothing. + * + * Layout matches externals/hyper-linux/src/guest.h:146-153 to keep the + * upcoming syscall_exec / fork_ipc ports straightforward. + */ +#define GUEST_MAX_OVERFLOW 4 +#define GUEST_OVERFLOW_SIZE \ + (1ULL * 1024 * 1024 * 1024) /* 1 GiB per segment \ + */ +typedef struct { + void *host_base; /* Host buffer backing the IPA range */ + uint64_t ipa_start; /* Stage-2 IPA of this segment */ + uint64_t size; /* Total bytes (always GUEST_OVERFLOW_SIZE today) */ + uint64_t next; /* Bump offset; (next + BLOCK_2MIB) > size means full */ +} guest_overflow_t; + /* Guest state. 
*/ typedef struct { void *host_base; /* Host pointer to allocated guest memory */ @@ -254,10 +338,57 @@ typedef struct { */ uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint; - uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */ - hv_vcpu_t vcpu; /* vCPU handle */ + uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */ + uint64_t ttbr1; /* TTBR1 value (IPA of L0 kernel page table; 0 if unused) */ + hv_vcpu_t vcpu; /* vCPU handle */ hv_vcpu_exit_t *exit; /* vCPU exit info */ uint32_t ipa_bits; /* IPA bits requested from HVF */ + + /* x86_64-via-Rosetta state. All zero for aarch64 guests. Populated when + * the rosetta feature flag is on and an EM_X86_64 binary is loaded. + * Survives guest_reset so execve of another x86_64 binary keeps the same + * placement and kbuf wiring. + * + * Field semantics mirror externals/hyper-linux/src/guest.h so that future + * ports of rosetta.c/.h, syscall_exec.c, and fork_ipc.c do not need to + * rename anything: + * rosetta_guest_base : Stage-2 GPA where rosetta segments are installed + * via guest_add_mapping (typically 128 TiB). + * rosetta_va_base : Guest virtual base where rosetta is loaded + * (matches its static link address, 0x800000000000). + * rosetta_size : Total bytes covering all rosetta PT_LOAD segments. + * rosetta_entry : Rosetta ELF entry point (high VA). + * kbuf_gpa : Stage-2 GPA backing the kbuf window inside the + * primary buffer (256 MiB, 2 MiB-aligned). + * kbuf_base : Host pointer to the kbuf, == host_base+kbuf_gpa. + * The guest VA for the kernel mirror is the fixed + * constant KBUF_VA_BASE; the user-VA alias is the + * derived constant KBUF_USER_VA. + */ + bool is_rosetta; + uint64_t rosetta_guest_base; + uint64_t rosetta_va_base; + uint64_t rosetta_size; + uint64_t rosetta_entry; + uint64_t kbuf_gpa; + void *kbuf_base; + + /* Extra IPA mappings installed via hv_vm_map at a non-zero GPA. Consulted + * by gva_resolve when the page-table walk yields a GPA outside the primary + * buffer (gpa >= guest_size). 
Cleared on guest_init; preserved across + * guest_reset because rosetta placement is stable across execve. + */ + guest_mapping_t mappings[GUEST_MAX_MAPPINGS]; + int n_mappings; + + /* Overflow segments. noverflow grows from 0 lazily as guest_overflow_alloc + * runs out of bump space. overflow_ipa_next tracks the next free IPA + * stacked above guest_size; initialized in guest_init to g->guest_size. + */ + guest_overflow_t overflow[GUEST_MAX_OVERFLOW]; + int noverflow; + uint64_t overflow_ipa_next; + /* Semantic region tracking for munmap/mprotect/proc-self-maps */ guest_region_t regions[GUEST_MAX_REGIONS]; int nregions; /* Number of active regions */ @@ -459,6 +590,175 @@ int guest_init_from_shm(guest_t *g, /* Tear down VM and free guest memory. */ void guest_destroy(guest_t *g); +/* Install a Stage-2 mapping for a high IPA range that the primary buffer does + * not cover (e.g. rosetta's segments at 128 TiB). Calls hv_vm_map with the + * supplied permissions. If host_va_inout points to NULL, allocates an anon + * host buffer of the requested size and records ownership so guest_destroy + * frees it. Otherwise the caller-supplied host_va is mapped as-is and the + * mapping does not own it. size and gpa must be page-aligned. + * + * The new region is appended to g->mappings[] for guest_ptr / gva_resolve + * fall-through. Returns 0 on success, -1 if GUEST_MAX_MAPPINGS is exhausted, + * if the allocation/mapping fails, or if the range collides with the primary + * buffer or an existing extra mapping. + * + * Locking: callers MUST hold mmap_lock. gva_resolve_perm reads mappings[] + * lock-free during page-table walks, so mutating n_mappings / mappings[] + * from concurrent vCPUs without serialization would race. 
+ */ +int guest_add_mapping(guest_t *g, + uint64_t gpa, + size_t size, + uint32_t hv_perms, + void **host_va_inout); + +/* Tear down the Rosetta-specific guest personality: unmap the translator's + * extra IPA mappings, clear the TTBR1/kbuf fields, and scrub the rosetta_* + * metadata. Used when a Rosetta-launched process execve()s an aarch64 image. + * + * Locking: callers MUST hold mmap_lock. gva_resolve_perm reads mappings[] + * lock-free during page-table walks. + */ +void guest_clear_rosetta_state(guest_t *g); + +/* Linear scan of g->mappings[] for the entry covering gpa. Returns NULL if + * gpa is below g->guest_size (i.e. inside the primary buffer) or not covered + * by any extra mapping. + */ +const guest_mapping_t *guest_find_mapping(const guest_t *g, uint64_t gpa); + +/* Bump-allocate a 2 MiB block from the overflow segments. Lazily creates a + * new 1 GiB segment (via mmap + hv_vm_map at g->overflow_ipa_next) when the + * existing segments are exhausted. Returns the GPA of the allocated block, + * or UINT64_MAX if all GUEST_MAX_OVERFLOW segments are full or a host/HVF + * allocation step failed. Callers should treat UINT64_MAX as -ENOMEM. + * + * Locking: callers MUST hold mmap_lock. gva_resolve_perm reads overflow[] + * lock-free during page-table walks. + */ +uint64_t guest_overflow_alloc(guest_t *g); + +/* Locate the overflow segment covering gpa. Returns NULL if gpa is not within + * any overflow segment. + */ +const guest_overflow_t *guest_find_overflow(const guest_t *g, uint64_t gpa); + +/* Returns true when [gpa, gpa+len) is fully contained within the primary + * buffer, OR fully contained within a single extra mapping, OR fully + * contained within a single overflow segment. The check rejects ranges that + * straddle region boundaries -- host pointers cannot safely span discontiguous + * backing regions. len == 0 returns true (zero-length ranges are well-formed + * by convention; syscall handlers that disallow them check separately). 
+ * + * Use this for syscalls that need to validate a guest IPA range without + * coupling to the primary-buffer-only assumption: sys_mmap MAP_FIXED, + * sys_munmap, sys_mremap, sys_mprotect, sys_msync, and any future caller + * that handles rosetta high-VA traffic. + */ +bool guest_is_valid_range(const guest_t *g, uint64_t gpa, uint64_t len); + +/* Initialize the TTBR1 kbuf window. kbuf_gpa must be 2 MiB-aligned and the + * [kbuf_gpa, kbuf_gpa + KBUF_SIZE) range must lie within the primary buffer. + * Allocates three page-table pages (L0/L1/L2) from the PT pool, populates + * L0[511] -> L1, L1[511] -> L2, and L2[384..511] = 128 x 2 MiB block + * descriptors with PT_AP_RW_EL0 | PT_UXN | PT_PXN (RW, non-executable). + * Stores the resulting TTBR1 IPA in g->ttbr1 and sets g->kbuf_gpa / + * g->kbuf_base. On any failure all three fields are scrubbed to 0/NULL + * so the caller cannot read stale state. + * + * Returns 0 on success, -1 on alignment / bounds / PT-pool-exhaustion failure. + * + * Locking: callers MUST hold mmap_lock. The function mutates the PT pool + * and the kbuf fields; gva translation reads ttbr1 lock-free. + */ +int guest_init_kbuf(guest_t *g, uint64_t kbuf_gpa); + +/* Install the TTBR0 user-VA mirror of the kbuf window. Walks the existing + * TTBR0 tree (at g->ttbr0) to L0[511]/L1[511]/L2[384..511] and writes + * 128 x 2 MiB block descriptors mapping [KBUF_USER_VA, KBUF_USER_VA + + * KBUF_SIZE) to [g->kbuf_gpa, g->kbuf_gpa + KBUF_SIZE) with RW + UXN + PXN + * perms. Rosetta's TaggedPointer extraction strips bits 63:48 so the same + * physical pages must be reachable through both TTBR1 (kernel VA) and + * TTBR0 (the user-VA alias). + * + * Must be called after guest_build_page_tables (g->ttbr0 must point at a + * valid L0 page) and after guest_init_kbuf (g->kbuf_gpa must be set). + * Returns 0 on success, -1 on PT-pool exhaustion or invalid state. + * + * Locking: callers MUST hold mmap_lock. 
+ */ +int guest_install_kbuf_user_alias(guest_t *g); + +/* Install L2 block descriptors mapping [va_start, va_end) to + * [gpa_start, gpa_start + (va_end-va_start)) under TTBR0. Both addresses + * and the size must be 2 MiB-aligned. Walks the existing TTBR0 tree at + * g->ttbr0 and allocates L1/L2 tables from the PT pool as needed. + * + * If an L2 slot is already populated, the function leaves it untouched + * and continues with the next block; the caller is expected to use + * guest_update_perms (or split + update_perms) if it needs to refine + * permissions on an already-mapped 2 MiB block. + * + * Records a guest_pt_gen_bump and a TLBI request covering the new range + * so the shim invalidates matching TLB entries on the way back to EL0. + * + * Returns 0 on success, -1 on alignment, PT-pool-exhaustion, or out-of-L0 + * failure. Used by sys_mmap for high-VA MAP_FIXED requests (rosetta's + * JIT slabs at 240 TiB, code caches at 85 TiB) where the VA lives outside + * the primary buffer but the GPA still does. + * + * Locking: callers MUST hold mmap_lock. + */ +int guest_map_va_range(guest_t *g, + uint64_t va_start, + uint64_t va_end, + uint64_t gpa_start, + int perms); + +/* Install (or overwrite) 4 KiB L3 page descriptors mapping [va, va+length) + * to [gpa, gpa+length) with the requested perms. Unlike guest_update_perms, + * which only edits existing descriptors and falls back to region metadata + * for invalid entries, this helper always writes a fresh make_page_desc so + * a previously-invalidated L3 slot is restored without consulting the + * region table. Splits L2 block descriptors on the path lazily. + * + * All three arguments (va, length, gpa) must be PAGE_SIZE aligned. The L2 + * chain (L0->L1->L2) must already be in place (caller's responsibility, + * typically via a prior guest_map_va_range). + * + * Returns 0 on success, -1 on alignment, missing L2 chain, or PT-pool + * exhaustion. + * + * Locking: callers MUST hold mmap_lock. 
+ */ +int guest_install_va_pages(guest_t *g, + uint64_t va, + uint64_t length, + uint64_t gpa, + int perms); + +/* Query whether a 2 MiB TTBR0 VA block already has a leaf mapping. + * Returns true only for a present L2 block descriptor. + */ +bool guest_va_block_mapped(const guest_t *g, uint64_t va); + +/* Returns true when the VA range [va, va+size) overlaps the user-VA kbuf + * alias window [KBUF_USER_VA, KBUF_USER_VA+KBUF_SIZE). Callers that install + * TTBR0 mappings (the future rosetta_finalize, sys_mmap MAP_FIXED touching + * this range, the page-table-build pass) must reject MEM_PERM_X when this + * helper returns true: TTBR1 maps the same physical pages RW only, and an + * executable TTBR0 alias would defeat HVF's per-mapping W^X enforcement. + */ +static inline bool guest_kbuf_user_va_overlap(uint64_t va, uint64_t size) +{ + if (size == 0) + return false; + uint64_t end = va + size; + if (end < va) /* arithmetic overflow */ + end = UINT64_MAX; + return va < (KBUF_USER_VA + KBUF_SIZE) && KBUF_USER_VA < end; +} + /* Get a host pointer for a guest virtual address (read access). * Returns NULL if gva is out of bounds or not readable. */ @@ -626,6 +926,15 @@ int guest_region_add_ex(guest_t *g, uint64_t offset, const char *name, int backing_fd); +int guest_region_add_ex_gpa(guest_t *g, + uint64_t start, + uint64_t end, + uint64_t gpa_base, + int prot, + int flags, + uint64_t offset, + const char *name, + int backing_fd); /* Like guest_region_add_ex, but consumes owned_backing_fd on success or * failure. */ @@ -637,6 +946,15 @@ int guest_region_add_ex_owned(guest_t *g, uint64_t offset, const char *name, int owned_backing_fd); +int guest_region_add_ex_owned_gpa(guest_t *g, + uint64_t start, + uint64_t end, + uint64_t gpa_base, + int prot, + int flags, + uint64_t offset, + const char *name, + int owned_backing_fd); /* Remove all region coverage in [start, end). Regions fully contained are * deleted; partially overlapping regions are trimmed or split. 
diff --git a/src/core/rosetta.c b/src/core/rosetta.c new file mode 100644 index 0000000..7805490 --- /dev/null +++ b/src/core/rosetta.c @@ -0,0 +1,1221 @@ +/* x86_64-via-Apple-Rosetta translator setup. + * + * Copyright 2026 elfuse contributors + * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. + * SPDX-License-Identifier: Apache-2.0 + * + * rosetta_prepare loads the Apple Rosetta binary into the primary buffer at + * a low GPA and exposes it at its statically-linked high VA (0x800000000000) + * via a non-identity mem_region_t.va_base. The TTBR1 kbuf is initialised at + * a 256 MiB window just below the rosetta image. rosetta_finalize wires the + * bootstrap-visible pieces needed to enter the translator: fd 3 setup, + * binfmt-style argv construction, cmdline refresh, and the TTBR0 kbuf alias. + * The runtime still depends on the high-VA mmap path in mem.c for Rosetta's + * own slab and JIT allocations. + * + * Elfuse extends mem_region_t with a va_base field instead, so the page-table + * builder handles non-identity placement in a single pass. + */ + +#include "core/rosetta.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "core/elf.h" +#include "core/guest.h" +#include "debug/log.h" +#include "hvutil.h" +#include "utils.h" +#include "syscall/internal.h" /* fd_alloc_at, FD_REGULAR */ +#include "syscall/proc.h" /* proc_set_cmdline */ + +/* Round a guest virtual address (or size) up to the next 2 MiB boundary. */ +static inline uint64_t align2m_up(uint64_t v) +{ + return (v + BLOCK_2MIB - 1) & ~(BLOCK_2MIB - 1); +} + +static inline uint64_t align2m_down(uint64_t v) +{ + return v & ~(BLOCK_2MIB - 1); +} + +/* The VZ_CAPS payload only has room for a 42-byte inline path. 
Publish + * /proc/self/fd/3 there when the real host path is longer so rosetta sees a + * valid reopenable path without truncation, while the host-side translator + * subprocess still retains the full original binary path. Both buffers are + * read by the VZ_CAPS ioctl handler from any vCPU; writes happen during + * rosetta_finalize on execve. A pthread_mutex covers both setter and + * snapshot reader so a multi-vCPU guest doing concurrent execves cannot + * observe a torn or stale string. + */ +static pthread_mutex_t rosettad_path_lock = PTHREAD_MUTEX_INITIALIZER; +static char rosettad_binary_path[PATH_MAX]; +static char rosettad_caps_binary_path[ROSETTA_CAPS_BINARY_PATH_LEN]; +static char rosettad_owned_binary_path[PATH_MAX]; + +/* Move any owned path out of rosettad_owned_binary_path into out (which + * the caller can then unlink after dropping the lock). Returns true if a + * path was drained. The path lock must be held by the caller. The unlink + * itself is deferred to the caller because it can block on slow + * filesystems (NFS, FUSE), and the path lock is also taken by the + * VZ_CAPS snapshot helpers on every vCPU; running the syscall inside the + * critical section would stall those readers. 
+ */
+static bool rosettad_drain_owned_path_locked(char out[PATH_MAX])
+{
+    if (rosettad_owned_binary_path[0] == '\0')
+        return false;
+    memcpy(out, rosettad_owned_binary_path, PATH_MAX);
+    rosettad_owned_binary_path[0] = '\0';
+    return true;
+}
+
+/* Publish the x86_64 target path for the VZ_CAPS handler.
+ *
+ * path           host path of the target binary; NULL is treated as "".
+ * take_ownership when true, the path is remembered as an elfuse-owned file
+ *                and unlinked the next time the path is replaced or cleared.
+ *
+ * The full path is stored (truncated to PATH_MAX - 1 with a warning); the
+ * caps path is either the same string or "/proc/self/fd/3" when the real
+ * path does not fit the caps buffer. Any previously owned path is unlinked
+ * after the lock is dropped.
+ */
+void rosettad_set_binary_path(const char *path, bool take_ownership)
+{
+    /* The fallback string must always fit the caps buffer. */
+    _Static_assert(sizeof("/proc/self/fd/3") <= ROSETTA_CAPS_BINARY_PATH_LEN,
+                   "caps path buffer too small for /proc/self/fd/3");
+
+    if (!path)
+        path = "";
+    const size_t n = strlen(path);
+
+    /* Work out lengths and emit diagnostics before taking the lock: the
+     * same lock is taken by the snapshot helpers, and logging may block.
+     */
+    size_t full_n = n;
+    const bool truncated = n >= sizeof(rosettad_binary_path);
+    if (truncated) {
+        log_warn("rosetta: full binary path too long, truncating: %s", path);
+        full_n = sizeof(rosettad_binary_path) - 1;
+    }
+
+    const char *caps_path = path;
+    if (n >= sizeof(rosettad_caps_binary_path)) {
+        caps_path = "/proc/self/fd/3";
+        log_debug("rosetta: using %s as caps binary path for long target %s",
+                  caps_path, path);
+    }
+    const size_t caps_n = strlen(caps_path);
+
+    /* A truncated copy names a different file; owning it would make the
+     * deferred unlink() remove something elfuse never created.
+     */
+    if (take_ownership && truncated) {
+        log_warn("rosetta: not taking ownership of over-long path");
+        take_ownership = false;
+    }
+
+    char prev_owned[PATH_MAX];
+    bool have_prev_owned;
+
+    pthread_mutex_lock(&rosettad_path_lock);
+
+    have_prev_owned = rosettad_drain_owned_path_locked(prev_owned);
+
+    memcpy(rosettad_binary_path, path, full_n);
+    rosettad_binary_path[full_n] = '\0';
+
+    memcpy(rosettad_caps_binary_path, caps_path, caps_n);
+    rosettad_caps_binary_path[caps_n] = '\0';
+
+    if (take_ownership)
+        str_copy_trunc(rosettad_owned_binary_path, path,
+                       sizeof(rosettad_owned_binary_path));
+
+    pthread_mutex_unlock(&rosettad_path_lock);
+
+    /* Deferred so a slow filesystem cannot stall readers of the lock. */
+    if (have_prev_owned)
+        unlink(prev_owned);
+}
+
+/* Reset both published paths to the empty string and unlink any owned
+ * path, again outside the critical section.
+ */
+void rosettad_clear_binary_path(void)
+{
+    char prev_owned[PATH_MAX];
+    bool have_prev_owned;
+
+    pthread_mutex_lock(&rosettad_path_lock);
+    have_prev_owned = rosettad_drain_owned_path_locked(prev_owned);
+    rosettad_binary_path[0] = '\0';
+    rosettad_caps_binary_path[0] = '\0';
+    pthread_mutex_unlock(&rosettad_path_lock);
+
+    if (have_prev_owned)
+        unlink(prev_owned);
+}
+
+/* Common body for the two snapshot helpers below. Copies src into out_buf
+ * with NUL termination while holding the rosetta path lock so the reader
+ * cannot observe a torn write from a concurrent execve.
+ */
+static size_t rosettad_snapshot_locked(const char *src,
+                                       char *out_buf,
+                                       size_t out_size)
+{
+    if (!out_buf || out_size == 0)
+        return 0;
+    pthread_mutex_lock(&rosettad_path_lock);
+    size_t n = strlen(src);
+    if (n >= out_size)
+        n = out_size - 1;
+    memcpy(out_buf, src, n);
+    out_buf[n] = '\0';
+    pthread_mutex_unlock(&rosettad_path_lock);
+    return n;
+}
+
+/* Snapshot the host-side full path; out_buf is sized PATH_MAX by callers. */
+size_t rosettad_snapshot_binary_path(char *out_buf, size_t out_size)
+{
+    return rosettad_snapshot_locked(rosettad_binary_path, out_buf, out_size);
+}
+
+/* Snapshot the VZ_CAPS 42-byte path; caller buffer is at least
+ * ROSETTA_CAPS_BINARY_PATH_LEN bytes wide.
+ */
+size_t rosettad_snapshot_caps_binary_path(char *out_buf, size_t out_size)
+{
+    return rosettad_snapshot_locked(rosettad_caps_binary_path, out_buf,
+                                    out_size);
+}
+
+/* Load the Rosetta translator image into the primary guest buffer, set up
+ * the TTBR1 kbuf, and append one non-identity region describing the image.
+ *
+ * On first entry (g->rosetta_guest_base == 0) a placement is chosen and
+ * recorded in g; on re-entry the recorded placement is reused and the
+ * segments and kbuf are rebuilt in place. Returns 0 on success with
+ * result->entry_point set, -1 on any failure (after logging).
+ */
+int rosetta_prepare(guest_t *g,
+                    const char *binary_path,
+                    mem_region_t *regions,
+                    int *nregions,
+                    int max_regions,
+                    bool verbose,
+                    rosetta_result_t *result)
+{
+    (void) binary_path; /* used only by rosetta_finalize */
+
+    if (!g || !regions || !nregions || !result)
+        return -1;
+    memset(result, 0, sizeof(*result));
+
+    if (access(ROSETTA_PATH, X_OK) != 0) {
+        log_error(
+            "rosetta: x86_64 binary requires the Rosetta Linux "
+            "translator at %s",
+            ROSETTA_PATH);
+        log_error("rosetta: install via 'softwareupdate --install-rosetta'");
+        return -1;
+    }
+
+    if (elf_load(ROSETTA_PATH, &result->rosetta_info) < 0) {
+        log_error("rosetta: failed to load rosetta ELF from %s", ROSETTA_PATH);
+        return -1;
+    }
+    elf_info_t *ri = &result->rosetta_info;
+    if (verbose) {
+        log_debug("rosetta: ELF entry=0x%llx load=[0x%llx,0x%llx)",
+                  (unsigned long long) ri->entry,
+                  (unsigned long long) ri->load_min,
+                  (unsigned long long) ri->load_max);
+    }
+
+    /* Compute 2 MiB-aligned placement covering all rosetta PT_LOAD segments.
+     * rosetta is statically linked at 0x800000000000; segments span a small
+     * range above that. The mapping must cover the entire span so all
+     * segments resolve through a single Stage-2 region.
+     */
+    uint64_t va_base = align2m_down(ri->load_min);
+    uint64_t va_end = align2m_up(ri->load_max);
+    if (va_end <= va_base) {
+        log_error("rosetta: empty load range");
+        return -1;
+    }
+    uint64_t size = va_end - va_base;
+
+    /* Pick a primary-buffer placement below the full high-IPA infra reserve,
+     * 2 MiB aligned. guest_init() has already reserved
+     * [interp_base - INFRA_RESERVE, interp_base) for the page-table pool,
+     * shim text, and shim data, so rosetta must stay below that window.
+     *
+     * HVF on M1 caps Stage-2 hv_vm_map at the hardware-default IPA width
+     * (36 bits on this host); a separate Stage-2 mapping at 128 TiB via
+     * guest_add_mapping is rejected with HV_BAD_ARGUMENT. Load rosetta into the
+     * primary buffer at a low GPA and use the non-identity page-table mapping
+     * (mem_region_t.va_base) to expose it at its statically-linked high VA.
+     */
+    uint64_t guest_base;
+    if (g->rosetta_guest_base == 0) {
+        uint64_t rosetta_limit = g->interp_base - INFRA_RESERVE;
+        if (g->interp_base < INFRA_RESERVE || rosetta_limit < size) {
+            log_error(
+                "rosetta: primary buffer too small for rosetta image "
+                "(%llu MiB) below infra reserve [0x%llx,0x%llx)",
+                (unsigned long long) (size >> 20),
+                (unsigned long long) rosetta_limit,
+                (unsigned long long) g->interp_base);
+            return -1;
+        }
+        guest_base = (rosetta_limit - size) & ~(BLOCK_2MIB - 1);
+        if (guest_base < g->stack_top + BLOCK_2MIB) {
+            log_error(
+                "rosetta: no image gap between stack_top=0x%llx and "
+                "rosetta@0x%llx",
+                (unsigned long long) g->stack_top,
+                (unsigned long long) guest_base);
+            return -1;
+        }
+
+        /* Load rosetta into the primary buffer. load_base = guest_base -
+         * va_base places p_vaddr+load_base inside host_base+guest_base.
+         * The wrap math is the same trick elf.c documents for high-VA
+         * binaries: uint64_t arithmetic, two's-complement intentional.
+         */
+        uint64_t load_base = guest_base - va_base;
+        if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size,
+                             load_base) < 0) {
+            log_error("rosetta: elf_map_segments failed");
+            return -1;
+        }
+
+        /* Place TTBR1 kbuf in the primary buffer just below the rosetta
+         * image. The kbuf needs to fit in [kbuf_gpa, kbuf_gpa + 256 MiB);
+         * brk and stack live below it, rosetta above. Validate the gap
+         * before subtracting so the underflow case (guest_base near 0)
+         * cannot wrap into a huge uint64_t that defeats the gap check.
+         */
+        if (guest_base < KBUF_SIZE + g->stack_top + BLOCK_2MIB) {
+            log_error(
+                "rosetta: no kbuf gap between stack_top=0x%llx and "
+                "rosetta@0x%llx",
+                (unsigned long long) g->stack_top,
+                (unsigned long long) guest_base);
+            return -1;
+        }
+        uint64_t kbuf_gpa = (guest_base - KBUF_SIZE) & ~(BLOCK_2MIB - 1);
+        if (guest_init_kbuf(g, kbuf_gpa) < 0) {
+            log_error("rosetta: guest_init_kbuf failed at 0x%llx",
+                      (unsigned long long) kbuf_gpa);
+            return -1;
+        }
+
+        g->rosetta_guest_base = guest_base;
+        g->rosetta_va_base = va_base;
+        g->rosetta_size = size;
+        g->rosetta_entry = ri->entry;
+
+        if (verbose) {
+            log_debug(
+                "rosetta: GPA=0x%llx VA=0x%llx size=%llu MiB "
+                "kbuf_gpa=0x%llx ttbr1=0x%llx",
+                (unsigned long long) guest_base, (unsigned long long) va_base,
+                (unsigned long long) (size >> 20),
+                (unsigned long long) g->kbuf_gpa,
+                (unsigned long long) g->ttbr1);
+        }
+    } else {
+        /* execve re-entry. The placement (guest_base, va_base, kbuf_gpa)
+         * survived guest_reset; the PT pool was zeroed so the TTBR1 tree
+         * must be rebuilt. Segments get reloaded in place.
+         */
+        guest_base = g->rosetta_guest_base;
+        uint64_t load_base = guest_base - va_base;
+        if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size,
+                             load_base) < 0) {
+            log_error("rosetta: re-entry elf_map_segments failed");
+            return -1;
+        }
+        if (g->kbuf_base)
+            memset(g->kbuf_base, 0, KBUF_SIZE);
+        if (guest_init_kbuf(g, g->kbuf_gpa) < 0) {
+            log_error("rosetta: re-entry guest_init_kbuf failed");
+            return -1;
+        }
+    }
+
+    /* I-cache invalidation for the loaded executable segments. macOS wrote
+     * the bytes via the data side; without explicit invalidation the first
+     * execution can hit stale I-cache entries.
+     */
+    for (int i = 0; i < ri->num_segments; i++) {
+        if (!(ri->segments[i].flags & PF_X))
+            continue;
+        uint64_t seg_offset = ri->segments[i].gpa - va_base;
+        if (seg_offset > size || ri->segments[i].memsz > size - seg_offset) {
+            log_error("rosetta: segment %d out of placement bounds", i);
+            return -1;
+        }
+        sys_icache_invalidate(
+            (uint8_t *) g->host_base + guest_base + seg_offset,
+            ri->segments[i].memsz);
+    }
+
+    /* One mem_region_t covering the whole rosetta image with the union of
+     * segment permissions. The page-table builder honours va_base for
+     * non-identity placement: entry indices come from va_base + offset,
+     * block descriptors point at guest_base + offset (the primary buffer).
+     * RWX is acceptable because rosetta is a static binary whose JIT writes
+     * code into separate mmap regions (not into its own image).
+     */
+    int perms = MEM_PERM_R;
+    for (int i = 0; i < ri->num_segments; i++) {
+        if (ri->segments[i].flags & PF_W)
+            perms |= MEM_PERM_W;
+        if (ri->segments[i].flags & PF_X)
+            perms |= MEM_PERM_X;
+    }
+    if (*nregions >= max_regions) {
+        log_error("rosetta: page-table region table exhausted");
+        return -1;
+    }
+    regions[(*nregions)++] = (mem_region_t) {
+        .gpa_start = guest_base,
+        .gpa_end = guest_base + size,
+        .va_base = va_base,
+        .perms = perms,
+    };
+
+    result->entry_point = ri->entry;
+    return 0;
+}
+
+/* Wire up the bootstrap-visible state needed to enter the translator:
+ * open the target binary, build the binfmt_misc-style argv, install the
+ * TTBR0 kbuf alias, place the binary at guest fd 3, publish its path, and
+ * refresh the cmdline.
+ *
+ * On success returns 0 and hands a malloc'd argv array to the caller via
+ * *out_argv (the strings themselves are borrowed from the inputs). On
+ * failure returns -1 with *out_argc == 0 and *out_argv == NULL.
+ */
+int rosetta_finalize(guest_t *g,
+                     hv_vcpu_t vcpu,
+                     const char *binary_host_path,
+                     bool binary_host_path_temp,
+                     const char *binary_guest_path,
+                     int guest_argc,
+                     const char **guest_argv,
+                     const rosetta_result_t *rr,
+                     bool verbose,
+                     int *out_argc,
+                     const char ***out_argv,
+                     uint64_t *out_vdso_addr)
+{
+    (void) vcpu; /* TCR_EL1 / TTBR1_EL1 set in bootstrap_create_vcpu */
+    (void) out_vdso_addr; /* bootstrap drives vdso_build directly */
+
+    if (!g || !binary_host_path || !binary_guest_path || !rr || !out_argc ||
+        !out_argv)
+        return -1;
+    *out_argc = 0;
+    *out_argv = NULL;
+
+    /* guest_argv[1..] is copied below; reject a NULL vector up front
+     * rather than dereferencing it in the copy loop.
+     */
+    if (guest_argc > 1 && !guest_argv) {
+        log_error("rosetta_finalize: guest_argc=%d but guest_argv is NULL",
+                  guest_argc);
+        return -1;
+    }
+
+    /* Defer every externally-visible state change (guest fd 3, cmdline,
+     * out_argc/out_argv) until after every fallible setup step succeeds.
+     * Any failure before commit goes through the fail label and tears down
+     * only the host-local resources allocated so far.
+     */
+    int bin_host_fd = -1;
+    const char **rosetta_argv = NULL;
+    int rosetta_argc = 0;
+
+    /* Pre-open the x86_64 binary so it can be installed at guest fd 3 once
+     * the full setup succeeds. Rosetta locates its target via /proc/self/fd/3
+     * (binfmt_misc convention).
+     */
+    bin_host_fd = open(binary_host_path, O_RDONLY);
+    if (bin_host_fd < 0) {
+        log_error("rosetta_finalize: failed to open binary '%s': %s",
+                  binary_host_path, strerror(errno));
+        goto fail;
+    }
+
+    /* Construct binfmt_misc-style argv. The lossy form is chosen here:
+     *   argv = [ROSETTA_PATH, binary_path, original_argv[1..argc-1], NULL]
+     *
+     * Rosetta uses argv[1] (the binary path) as the guest's argv[0] after
+     * stripping the first two slots. Trade-offs vs the preserving form:
+     *   + busybox applet dispatch (basename(argv[0])) works because the
+     *     guest sees argv[0] == binary_path, and that path's basename is
+     *     the applet name.
+     *   - login shells that rely on the leading-dash argv[0] convention
+     *     (-sh, -bash) lose the dash.
+     *   - execve(path, "altname", ...) loses "altname"; the guest sees
+     *     binary_path as argv[0].
+     * If a real workload surfaces either pain point, switch to the
+     * preserving form (4 slots: rosetta, binary, original_argv[0],
+     * original_argv[1..]) and verify rosetta strips both leading slots
+     * rather than one.
+     *
+     * Minimum argc is 2 (rosetta + binary) so rosetta always sees argv[1]
+     * even when the caller supplied no argv.
+     */
+    rosetta_argc = (guest_argc > 0) ? guest_argc + 1 : 2;
+    rosetta_argv = malloc(sizeof(char *) * (rosetta_argc + 1));
+    if (!rosetta_argv) {
+        log_error("rosetta_finalize: malloc(%d argv slots) failed",
+                  rosetta_argc + 1);
+        goto fail;
+    }
+    rosetta_argv[0] = ROSETTA_PATH;
+    rosetta_argv[1] = binary_guest_path;
+    for (int i = 1; i < guest_argc; i++)
+        rosetta_argv[i + 1] = guest_argv[i];
+    rosetta_argv[rosetta_argc] = NULL;
+
+    /* Install the TTBR0 user-VA alias for the kbuf so rosetta's TaggedPointer
+     * extraction (which strips bits 63:48) resolves to the same physical
+     * pages as the TTBR1 kernel-VA window. The aliasing-proof invariant
+     * (RW + UXN + PXN under both mappings) is enforced inside the helper.
+     * An installed-but-unused alias is harmless (read-write pages aliasing
+     * the same physical kbuf), so the commit step below does not need to
+     * roll it back if a later allocation fails.
+     */
+    if (guest_install_kbuf_user_alias(g) < 0) {
+        log_error("rosetta_finalize: failed to install TTBR0 kbuf alias");
+        goto fail;
+    }
+
+    /* Commit. Installing guest fd 3 is the last step that can fail; once
+     * it succeeds, publish the binary path to the VZ_CAPS handler, refresh
+     * /proc/self/cmdline, and transfer argv ownership to the caller.
+     */
+    int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd);
+    if (bin_guest_fd < 0) {
+        log_error("rosetta_finalize: fd_alloc_at(3) failed");
+        goto fail;
+    }
+    bin_host_fd = -1; /* Ownership transferred to the guest fd table */
+    /* Mark the rosetta target fd CLOEXEC so a rosetta-to-native execve
+     * does not leak it into the new image. fd_alloc_at clears
+     * linux_flags, so the OR is safe.
+     */
+    fd_table[bin_guest_fd].linux_flags |= LINUX_O_CLOEXEC;
+
+    rosettad_set_binary_path(binary_host_path, binary_host_path_temp);
+    proc_set_cmdline(rosetta_argc, rosetta_argv);
+
+    if (verbose)
+        log_debug("rosetta_finalize: argv=[%s, %s, ...], target_fd=3",
+                  rosetta_argv[0], rosetta_argv[1]);
+
+    *out_argc = rosetta_argc;
+    *out_argv = rosetta_argv;
+
+    /* The VZ ioctl trio is in; the rosettad translate pipeline and the
+     * mem.c body refactor for rosetta high-VA mmap allocations are still
+     * pending. Without rosettad, rosetta issues a translate request, hits
+     * the socketpair where the handler returns MISS, and exits. Without
+     * the high-VA mmap support, rosetta's slab allocator at 240 TiB cannot
+     * back its JIT memory and aborts in VMAllocationTracker.
+     */
+    log_debug(
+        "rosetta_finalize: setup complete; runtime path still needs "
+        "rosettad bridge + high-VA mmap support");
+    return 0;
+
+fail:
+    if (bin_host_fd >= 0)
+        close(bin_host_fd);
+    free(rosetta_argv);
+    return -1;
+}
+
+/* SHA-256 over a file descriptor */
+
+/* Streaming chunk for digesting a binary fd; matches the typical filesystem
+ * read-ahead window so a few syscalls cover most x86_64 ELFs.
+ */
+#define ROSETTAD_SHA256_CHUNK 65536
+
+/* Hash the whole file behind fd into digest. The fd is rewound to offset 0
+ * for the read and its previous offset is restored afterwards when it
+ * could be queried. Returns 0 on success, -1 if the rewind or a read
+ * fails; digest is only written on success.
+ */
+static int compute_fd_sha256(int fd, uint8_t digest[ROSETTAD_DIGEST_SIZE])
+{
+    off_t saved = lseek(fd, 0, SEEK_CUR);
+    if (lseek(fd, 0, SEEK_SET) < 0)
+        return -1;
+
+    CC_SHA256_CTX ctx;
+    CC_SHA256_Init(&ctx);
+
+    uint8_t buf[ROSETTAD_SHA256_CHUNK];
+    ssize_t n;
+    for (;;) {
+        n = read(fd, buf, sizeof(buf));
+        if (n > 0) {
+            CC_SHA256_Update(&ctx, buf, (CC_LONG) n);
+            continue;
+        }
+        /* An interrupted read is not a failure; retry like the copy loop
+         * in aot_materialize_input_fd does.
+         */
+        if (n < 0 && errno == EINTR)
+            continue;
+        break; /* EOF (n == 0) or hard error (n < 0) */
+    }
+
+    if (saved >= 0)
+        (void) lseek(fd, saved, SEEK_SET);
+    if (n < 0)
+        return -1;
+
+    CC_SHA256_Final(digest, &ctx);
+    return 0;
+}
+
+/* Render digest as lowercase hex into hex, NUL-terminated. */
+static void digest_to_hex(const uint8_t digest[ROSETTAD_DIGEST_SIZE],
+                          char hex[ROSETTAD_DIGEST_HEX_LEN])
+{
+    static const char hex_chars[] = "0123456789abcdef";
+    for (int i = 0; i < ROSETTAD_DIGEST_SIZE; i++) {
+        hex[i * 2 + 0] = hex_chars[(digest[i] >> 4) & 0xf];
+        hex[i * 2 + 1] = hex_chars[digest[i] & 0xf];
+    }
+    hex[ROSETTAD_DIGEST_SIZE * 2] = '\0';
+}
+
+/* AOT cache paths */
+
+/* Build <HOME>/<ROSETTAD_CACHE_SUBDIR>[/<suffix>] into out. When suffix is
+ * NULL, write the bare cache directory; otherwise append "/<suffix>". Lazily
+ * creates ~/.cache and the elfuse-rosettad subdirectory; EEXIST on either is
+ * fine. Returns 0 on success, -1 on any failure (HOME unset, snprintf
+ * truncation, mkdir denied for a reason other than EEXIST).
+ */
+static int aot_cache_path(const char *suffix, char *out, size_t outsz)
+{
+    const char *home = getenv("HOME");
+    if (!home || !*home)
+        return -1;
+
+    /* Make ~/.cache first (fresh-user case), then the elfuse subdirectory.
+     * The intermediate path lives in a scratch buffer so out is only written
+     * once -- callers may pass the same buffer for both phases.
+     */
+    char parent[PATH_MAX];
+    int pn = snprintf(parent, sizeof(parent), "%s/.cache", home);
+    if (pn > 0 && (size_t) pn < sizeof(parent))
+        (void) mkdir(parent, 0755); /* best effort; checked via dir below */
+
+    char dir[PATH_MAX];
+    int dn = snprintf(dir, sizeof(dir), "%s/%s", home, ROSETTAD_CACHE_SUBDIR);
+    if (dn < 0 || (size_t) dn >= sizeof(dir))
+        return -1;
+    if (mkdir(dir, 0755) < 0 && errno != EEXIST)
+        return -1;
+
+    int n = suffix ? snprintf(out, outsz, "%s/%s", dir, suffix)
+                   : snprintf(out, outsz, "%s", dir);
+    if (n < 0 || (size_t) n >= outsz)
+        return -1;
+    return 0;
+}
+
+/* Convenience: build the persistent <cache-dir>/<hex-digest><suffix> path
+ * for a digest. suffix is appended verbatim (".aot", ".aot.new.<pid>",
+ * etc.); NULL means no suffix.
+ */
+static int aot_cache_path_for_digest(const uint8_t digest[ROSETTAD_DIGEST_SIZE],
+                                     const char *suffix,
+                                     char *out,
+                                     size_t outsz)
+{
+    char hex[ROSETTAD_DIGEST_HEX_LEN];
+    digest_to_hex(digest, hex);
+    char leaf[ROSETTAD_DIGEST_HEX_LEN + 32];
+    int ln = snprintf(leaf, sizeof(leaf), "%s%s", hex, suffix ? suffix : "");
+    if (ln < 0 || (size_t) ln >= sizeof(leaf))
+        return -1;
+    return aot_cache_path(leaf, out, outsz);
+}
+
+/* Open the cached AOT file for a binary with the given digest. Returns
+ * the open fd (O_RDONLY) on hit, -1 on miss or any other error. The fd
+ * is positioned at offset 0 so the caller can hand it straight back via
+ * SCM_RIGHTS. O_CLOEXEC keeps the descriptor out of a translator
+ * subprocess spawned while it is open; the flag is per-descriptor and
+ * does not follow the fd across SCM_RIGHTS.
+ */
+static int aot_cache_lookup(const uint8_t digest[ROSETTAD_DIGEST_SIZE])
+{
+    char path[PATH_MAX];
+    if (aot_cache_path_for_digest(digest, ".aot", path, sizeof(path)) < 0)
+        return -1;
+    return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+/* Materialise the current bytes behind bin_fd into a temp file in the cache
+ * directory so the translator reads the same inode contents that were hashed.
+ * The caller owns out_path on success and must unlink it after use.
+ */
+static int aot_materialize_input_fd(int bin_fd, char out_path[PATH_MAX])
+{
+    if (aot_cache_path("input.XXXXXX", out_path, PATH_MAX) < 0)
+        return -1;
+
+    int tmp_fd = mkstemp(out_path);
+    if (tmp_fd < 0)
+        return -1;
+
+    /* Remember the caller's offset, then rewind; a failed rewind aborts
+     * before anything has been copied.
+     */
+    const off_t restore_to = lseek(bin_fd, 0, SEEK_CUR);
+    if (lseek(bin_fd, 0, SEEK_SET) < 0) {
+        close(tmp_fd);
+        (void) unlink(out_path);
+        return -1;
+    }
+
+    uint8_t chunk[ROSETTAD_SHA256_CHUNK];
+    bool ok = true;
+    bool eof = false;
+    while (ok && !eof) {
+        ssize_t got = read(bin_fd, chunk, sizeof(chunk));
+        if (got == 0) {
+            eof = true;
+        } else if (got < 0) {
+            /* EINTR simply goes round again; anything else is fatal. */
+            if (errno != EINTR)
+                ok = false;
+        } else {
+            /* Drain the chunk, tolerating short and interrupted writes. */
+            size_t done = 0;
+            while (ok && done < (size_t) got) {
+                ssize_t put = write(tmp_fd, chunk + done, (size_t) got - done);
+                if (put >= 0)
+                    done += (size_t) put;
+                else if (errno != EINTR)
+                    ok = false;
+            }
+        }
+    }
+
+    int result = ok ? 0 : -1;
+
+    if (restore_to >= 0)
+        (void) lseek(bin_fd, restore_to, SEEK_SET);
+    /* A close failure after an otherwise clean copy still counts as an
+     * error; either way a failed copy removes the temp file.
+     */
+    if (close(tmp_fd) < 0 && result == 0)
+        result = -1;
+    if (result < 0)
+        (void) unlink(out_path);
+    return result;
+}
+
+/* Translator publishes via rename() of its temp file -- a content copy is
+ * unnecessary because the AOT output is already in the cache directory.
+ * Keeping the cache-store step as a single rename inline in
+ * rosettad_translate; a separate copy helper would only be needed for
+ * paths that produce the AOT bytes outside the cache directory.
+ */
+
+/* SCM_RIGHTS fd-passing */
+
+/* Receive exactly one fd alongside buflen bytes of normal data. On
+ * success returns the number of normal bytes received and writes the fd
+ * to *out_fd. EINTR is retried. The protocol requires exactly one
+ * SCM_RIGHTS cmsg carrying exactly one fd; anything weaker (truncation,
+ * extra cmsgs, multiple fds, wrong cmsg payload size) is a malformed
+ * peer and rejected. On rejection any kernel-allocated fds are closed
+ * to avoid leaking them into the elfuse process.
+ */
+static ssize_t rosettad_recv_fd(int sock, void *buf, size_t buflen, int *out_fd)
+{
+    *out_fd = -1;
+    struct iovec iov = {.iov_base = buf, .iov_len = buflen};
+    /* The control buffer is read through struct cmsghdr pointers by the
+     * CMSG_* macros, so it must be aligned for that type; a bare byte
+     * array carries no such guarantee. The union supplies it.
+     */
+    union {
+        struct cmsghdr align;
+        uint8_t buf[CMSG_SPACE(sizeof(int))];
+    } ctl;
+    struct msghdr msg = {
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = ctl.buf,
+        .msg_controllen = sizeof(ctl.buf),
+    };
+    ssize_t n;
+    do {
+        n = recvmsg(sock, &msg, 0);
+    } while (n < 0 && errno == EINTR);
+    if (n <= 0)
+        return n;
+
+    /* Walk every cmsg first, closing kernel-allocated fds in malformed or
+     * extra payloads. Defer the MSG_CTRUNC bailout until after the walk so
+     * fds that fit in the control buffer are closed instead of leaked into
+     * the elfuse process. cmsg_len is validated against CMSG_LEN(0) before
+     * any payload arithmetic to keep a hostile peer from underflowing the
+     * size_t.
+     */
+    int n_rights = 0;
+    bool malformed = false;
+    for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
+        if (c->cmsg_level != SOL_SOCKET || c->cmsg_type != SCM_RIGHTS) {
+            malformed = true;
+            continue;
+        }
+        /* cmsg_len must cover at least the cmsghdr header; the macro
+         * captures any alignment padding the platform inserts. Anything
+         * smaller is structurally invalid and the payload arithmetic
+         * below would underflow into a huge size_t.
+         */
+        if (c->cmsg_len < CMSG_LEN(0)) {
+            malformed = true;
+            continue;
+        }
+        size_t payload = c->cmsg_len - CMSG_LEN(0);
+        if (payload % sizeof(int) != 0) {
+            malformed = true;
+            continue;
+        }
+        size_t nfd = payload / sizeof(int);
+        for (size_t i = 0; i < nfd; i++) {
+            int fd;
+            memcpy(&fd, (uint8_t *) CMSG_DATA(c) + i * sizeof(int), sizeof(fd));
+            /* Canonical case: exactly one fd in exactly one SCM_RIGHTS
+             * cmsg. Anything else (extra cmsgs, extra fds per cmsg) is
+             * malformed; close every fd that does not fit the canonical
+             * slot so none leak.
+             */
+            if (n_rights == 0 && nfd == 1) {
+                *out_fd = fd;
+            } else {
+                close(fd);
+                malformed = true;
+            }
+            n_rights++;
+        }
+    }
+
+    /* MSG_CTRUNC means the kernel discarded part of the ancillary data
+     * because the control buffer was too small. fds in the discarded
+     * portion were dropped by the kernel without ever entering this
+     * process; fds in the surviving cmsgs were walked above. Treat as a
+     * hard protocol error because the message framing is no longer
+     * reliable.
+     */
+    if ((msg.msg_flags & MSG_CTRUNC) || malformed || n_rights != 1) {
+        if (*out_fd >= 0) {
+            close(*out_fd);
+            *out_fd = -1;
+        }
+        errno = EPROTO;
+        return -1;
+    }
+    return n;
+}
+
+/* Send one fd alongside a 1-byte normal-data payload. Rosetta's recv_fd
+ * counterpart allocates a 1-byte iov buffer and silently drops anything
+ * larger; matching that exactly keeps the protocol bit-compatible.
+ * Returns the sendmsg result (bytes sent, or -1 with errno set); EINTR is
+ * retried.
+ */
+static ssize_t rosettad_send_fd(int sock, uint8_t payload, int send_fd)
+{
+    struct iovec iov = {.iov_base = &payload, .iov_len = 1};
+    /* Same alignment requirement as the receive side. */
+    union {
+        struct cmsghdr align;
+        uint8_t buf[CMSG_SPACE(sizeof(int))];
+    } ctl;
+    memset(&ctl, 0, sizeof(ctl));
+    struct msghdr msg = {
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = ctl.buf,
+        .msg_controllen = sizeof(ctl.buf),
+    };
+    struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
+    c->cmsg_level = SOL_SOCKET;
+    c->cmsg_type = SCM_RIGHTS;
+    c->cmsg_len = CMSG_LEN(sizeof(int));
+    memcpy(CMSG_DATA(c), &send_fd, sizeof(int));
+    ssize_t n;
+    do {
+        n = sendmsg(sock, &msg, 0);
+    } while (n < 0 && errno == EINTR);
+    return n;
+}
+
+/* Translate subprocess */
+
+/* Spawn `elfuse rosettad translate <in_path> <out_path>` and wait for it
+ * to exit. Returns 0 if the translator exited successfully and the
+ * output file is non-empty, -1 otherwise.
+ */
+static int translate_via_rosettad(const char *in_path, const char *out_path)
+{
+    const char *self = proc_get_elfuse_path();
+    if (!self) {
+        log_error(
+            "rosettad: cannot locate elfuse binary for translate "
+            "subprocess");
+        return -1;
+    }
+    char *argv[] = {
+        (char *) self,    (char *) "rosettad", (char *) "translate",
+        (char *) in_path, (char *) out_path,   NULL,
+    };
+    pid_t pid;
+    extern char **environ;
+    /* posix_spawn reports failure through its return value, not errno. */
+    int spawn_rc = posix_spawn(&pid, self, NULL, NULL, argv, environ);
+    if (spawn_rc != 0) {
+        log_error("rosettad: posix_spawn failed: %s", strerror(spawn_rc));
+        return -1;
+    }
+
+    /* Bounded wait: a hung translator must not stall the handler thread
+     * that owns the rosetta socket. The wall-clock budget defaults to
+     * 120 s; on expiry, SIGKILL the child and report failure to the caller
+     * so rosetta falls through to its JIT path. Override via the
+     * ELFUSE_ROSETTAD_TIMEOUT env var (seconds, 1..3599) for stress
+     * testing.
+     */
+    int timeout_sec = 120;
+    const char *to_env = getenv("ELFUSE_ROSETTAD_TIMEOUT");
+    if (to_env && *to_env) {
+        long v = strtol(to_env, NULL, 10);
+        if (v > 0 && v < 3600)
+            timeout_sec = (int) v;
+    }
+    int status = 0;
+    pid_t r = 0;
+    int waited_ms = 0;
+    const int poll_ms = 50;
+    while (waited_ms < timeout_sec * 1000) {
+        r = waitpid(pid, &status, WNOHANG);
+        if (r == pid)
+            break;
+        if (r < 0) {
+            if (errno == EINTR)
+                continue;
+            log_error("rosettad: waitpid failed: %s", strerror(errno));
+            return -1;
+        }
+        struct timespec sleep_req = {.tv_sec = 0,
+                                     .tv_nsec = poll_ms * 1000000L};
+        nanosleep(&sleep_req, NULL);
+        waited_ms += poll_ms;
+    }
+    if (r != pid) {
+        log_error(
+            "rosettad: translate subprocess timed out after %d s, "
+            "killing pid %d",
+            timeout_sec, (int) pid);
+        kill(pid, SIGKILL);
+        /* Reap the killed child so it does not linger as a zombie. */
+        do {
+            r = waitpid(pid, &status, 0);
+        } while (r < 0 && errno == EINTR);
+        return -1;
+    }
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        log_error("rosettad: translate subprocess exited %d (status=0x%x)",
+                  WIFEXITED(status) ? WEXITSTATUS(status) : -1, status);
+        return -1;
+    }
+    struct stat st;
+    if (stat(out_path, &st) < 0 || st.st_size == 0) {
+        log_error("rosettad: translate produced empty/missing output");
+        return -1;
+    }
+    return 0;
+}
+
+/* Run the translate pipeline for a binary fd: SHA-256, cache lookup
+ * (hit -> return cached fd), or spawn the translator and publish the
+ * result. Returns an O_RDONLY fd pointing at the AOT file on success,
+ * -1 on any failure. *out_digest is always written when the SHA-256
+ * succeeds; the caller passes it back to rosetta so subsequent `d`
+ * lookups reuse the same key.
+ */
+static int rosettad_translate(int bin_fd,
+                              uint8_t out_digest[ROSETTAD_DIGEST_SIZE])
+{
+    /* Materialise first, hash second. Hashing bin_fd directly and then
+     * copying its bytes later opens a TOCTOU window: a concurrent writer
+     * mutating the inode between the two reads can poison the cache with
+     * bytes whose digest does not match. Snapshot the contents into an
+     * elfuse-owned temp file, then compute the digest from that snapshot
+     * so (hash, bytes) are taken from the same on-disk state.
+     */
+    char in_path[PATH_MAX];
+    if (aot_materialize_input_fd(bin_fd, in_path) < 0) {
+        log_error("rosettad: failed to materialize translate input");
+        return -1;
+    }
+
+    int in_fd = open(in_path, O_RDONLY | O_CLOEXEC);
+    if (in_fd < 0) {
+        log_error("rosettad: failed to reopen translate input: %s",
+                  strerror(errno));
+        (void) unlink(in_path);
+        return -1;
+    }
+    if (compute_fd_sha256(in_fd, out_digest) < 0) {
+        log_error("rosettad: digest of translate input failed");
+        close(in_fd);
+        (void) unlink(in_path);
+        return -1;
+    }
+    close(in_fd);
+
+    int cached = aot_cache_lookup(out_digest);
+    if (cached >= 0) {
+        (void) unlink(in_path);
+        return cached;
+    }
+
+    /* Per-process temp name so concurrent elfuse processes translating the
+     * same digest do not write over each other's output.
+     */
+    char tmp_suffix[32];
+    if (snprintf(tmp_suffix, sizeof(tmp_suffix), ".aot.new.%d",
+                 (int) getpid()) < 0) {
+        (void) unlink(in_path);
+        return -1;
+    }
+    char tmp_path[PATH_MAX];
+    if (aot_cache_path_for_digest(out_digest, tmp_suffix, tmp_path,
+                                  sizeof(tmp_path)) < 0) {
+        (void) unlink(in_path);
+        return -1;
+    }
+
+    if (translate_via_rosettad(in_path, tmp_path) < 0) {
+        (void) unlink(in_path);
+        (void) unlink(tmp_path);
+        return -1;
+    }
+    (void) unlink(in_path);
+
+    /* O_CLOEXEC matches the input fd above: the descriptor is only ever
+     * handed on via SCM_RIGHTS, never by inheritance.
+     */
+    int aot_fd = open(tmp_path, O_RDONLY | O_CLOEXEC);
+    if (aot_fd < 0) {
+        (void) unlink(tmp_path);
+        return -1;
+    }
+
+    /* Publish to the persistent cache; if rename fails another translator
+     * is racing this one, but aot_fd still points at the temp file's data
+     * and is safe to return. The temp name is removed whenever it was not
+     * renamed -- including when the final path cannot be built -- so no
+     * stray .aot.new.<pid> files accumulate.
+     */
+    char final_path[PATH_MAX];
+    if (aot_cache_path_for_digest(out_digest, ".aot", final_path,
+                                  sizeof(final_path)) < 0 ||
+        rename(tmp_path, final_path) < 0)
+        (void) unlink(tmp_path);
+    return aot_fd;
+}
+
+/* rosettad handler thread */
+
+/* Maximum size of the per-translate params buffer rosetta sends alongside
+ * the binary fd. The protocol allows up to this many bytes of opaque data;
+ * the handler reads them but does not interpret them today.
+ */
+#define ROSETTAD_PARAMS_MAX 256
+
+/* Rosetta's view of the socketpair: the fd it received via sys_socket.
+ * Recorded so sys_connect / recvmsg / sendmsg can short-circuit the
+ * connect (the socketpair is pre-wired) and pick rosettad-aware paths.
+ * Static visibility: at most one rosettad bridge per elfuse process.
+ *
+ * The handler thread writes -1 to this field at termination while syscall
+ * threads keep reading it via rosettad_is_socket. Plain int would let the
+ * compiler tear or fold the load, so the field is atomic. The
+ * classification read in rosettad_is_socket is independent of any other
+ * state and uses relaxed ordering; the idle hand-off (handler store,
+ * rosettad_wait_for_idle load) uses release/acquire.
+ */
+static _Atomic int rosettad_client_fd = -1;
+
+/* True when host_fd is the client end of the active rosettad bridge. */
+bool rosettad_is_socket(int host_fd)
+{
+    int active =
+        atomic_load_explicit(&rosettad_client_fd, memory_order_relaxed);
+    return host_fd >= 0 && host_fd == active;
+}
+
+/* Poll for up to roughly max_ms milliseconds until no bridge is active.
+ * Returns true if the marker was observed cleared, false on timeout.
+ */
+bool rosettad_wait_for_idle(unsigned int max_ms)
+{
+    /* 1 ms granularity poll on the atomic. The handler thread clears the
+     * marker on EOF or on explicit QUIT; both paths run inside the
+     * detached worker, which the caller cannot join.
+     */
+    for (unsigned int i = 0; i < max_ms; i++) {
+        if (atomic_load_explicit(&rosettad_client_fd, memory_order_acquire) ==
+            -1)
+            return true;
+        usleep(1000);
+    }
+    return atomic_load_explicit(&rosettad_client_fd, memory_order_acquire) ==
+           -1;
+}
+
+/* Read until len bytes arrived or EOF. Returns the byte count actually
+ * read (short on EOF) or -1 on a non-EINTR error.
+ */
+static ssize_t rosettad_read_full(int fd, void *buf, size_t len)
+{
+    uint8_t *p = buf;
+    size_t got = 0;
+    while (got < len) {
+        ssize_t n = read(fd, p + got, len - got);
+        if (n == 0)
+            return (ssize_t) got;
+        if (n < 0) {
+            if (errno == EINTR)
+                continue;
+            return -1;
+        }
+        got += (size_t) n;
+    }
+    return (ssize_t) got;
+}
+
+/* Write a single byte, retrying EINTR. Returns 0 on success, -1 otherwise. */
+static int rosettad_write_byte(int fd, uint8_t b)
+{
+    for (;;) {
+        ssize_t n = write(fd, &b, 1);
+        if (n == 1)
+            return 0;
+        if (n < 0 && errno == EINTR)
+            continue;
+        return -1;
+    }
+}
+
+/* Write all len bytes, retrying EINTR and short writes. A zero-byte write
+ * is treated as failure. Returns 0 on success, -1 otherwise.
+ */
+static int rosettad_write_full(int fd, const void *buf, size_t len)
+{
+    const uint8_t *p = buf;
+    size_t sent = 0;
+    while (sent < len) {
+        ssize_t n = write(fd, p + sent, len - sent);
+        if (n > 0) {
+            sent += (size_t) n;
+            continue;
+        }
+        if (n < 0 && errno == EINTR)
+            continue;
+        return -1;
+    }
+    return 0;
+}
+
+/* Send the success reply for a TRANSLATE or DIGEST command:
+ *   {HIT byte, optional 32-byte digest, AOT fd via SCM_RIGHTS in a 1-byte iov}.
+ * Pass digest=NULL on the DIGEST path (rosetta already knows the key).
+ * Returns 0 on success, -1 if any wire write failed.
+ */
+static int rosettad_send_aot(int fd, const uint8_t *digest, int aot_fd)
+{
+    if (rosettad_write_byte(fd, ROSETTAD_RESP_HIT) < 0)
+        return -1;
+    if (digest && rosettad_write_full(fd, digest, ROSETTAD_DIGEST_SIZE) < 0)
+        return -1;
+    if (rosettad_send_fd(fd, 0, aot_fd) < 0)
+        return -1;
+    return 0;
+}
+
+/* Detached worker serving the handler end of the bridge socketpair. arg
+ * carries the handler fd. Loops reading one command byte at a time until
+ * EOF, QUIT, or a wire error, then closes the fd and clears the bridge
+ * marker.
+ */
+static void *rosettad_handler_thread(void *arg)
+{
+    int fd = (int) (intptr_t) arg;
+
+    /* Block the host-directed signals elfuse uses internally so the handler
+     * thread is not interrupted in the middle of a protocol exchange. The
+     * SIGPIPE / SIGCHLD masking matches the rest of elfuse's worker
+     * threads; SIGUSR1 is the vCPU timer kick.
+     */
+    sigset_t mask;
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGPIPE);
+    sigaddset(&mask, SIGCHLD);
+    sigaddset(&mask, SIGUSR1);
+    pthread_sigmask(SIG_BLOCK, &mask, NULL);
+
+    for (;;) {
+        uint8_t cmd;
+        ssize_t n;
+        do {
+            n = read(fd, &cmd, 1);
+        } while (n < 0 && errno == EINTR);
+        if (n <= 0)
+            break;
+
+        switch (cmd) {
+        case ROSETTAD_CMD_HANDSHAKE:
+            /* Reply HIT so rosetta proceeds with the AOT-aware code path. */
+            if (rosettad_write_byte(fd, ROSETTAD_RESP_HIT) < 0)
+                goto done;
+            break;
+
+        case ROSETTAD_CMD_TRANSLATE: {
+            /* Translate request: rosetta sends the binary fd via sendmsg
+             * with up to ROSETTAD_PARAMS_MAX bytes of params. Compute the
+             * SHA-256, look up the persistent AOT cache, and on miss invoke
+             * the translator subprocess. Reply is {HIT, digest[32], fd via
+             * SCM_RIGHTS in a 1-byte iov payload} or MISS on any failure.
+             *
+             * recv_fd failure means protocol desync (no way to tell what
+             * rosetta did or did not send): close the connection. Translate
+             * failure is recoverable -- reply MISS and keep serving.
+             */
+            uint8_t params[ROSETTAD_PARAMS_MAX];
+            int bin_fd = -1;
+            ssize_t rn = rosettad_recv_fd(fd, params, sizeof(params), &bin_fd);
+            if (rn <= 0 || bin_fd < 0) {
+                if (bin_fd >= 0)
+                    close(bin_fd);
+                (void) rosettad_write_byte(fd, ROSETTAD_RESP_MISS);
+                goto done;
+            }
+            uint8_t digest[ROSETTAD_DIGEST_SIZE];
+            int aot_fd = rosettad_translate(bin_fd, digest);
+            close(bin_fd);
+            if (aot_fd < 0) {
+                if (rosettad_write_byte(fd, ROSETTAD_RESP_MISS) < 0)
+                    goto done;
+                break;
+            }
+            int sr = rosettad_send_aot(fd, digest, aot_fd);
+            close(aot_fd);
+            if (sr < 0)
+                goto done;
+            break;
+        }
+
+        case ROSETTAD_CMD_DIGEST: {
+            /* Digest lookup: rosetta caches its own .flu digests across
+             * runs and asks here first to skip re-translation. Cache hit
+             * sends back the AOT fd; miss makes rosetta fall through to a
+             * full translate request.
+             */
+            uint8_t digest[ROSETTAD_DIGEST_SIZE];
+            if (rosettad_read_full(fd, digest, sizeof(digest)) !=
+                (ssize_t) sizeof(digest))
+                goto done;
+            int cached = aot_cache_lookup(digest);
+            if (cached < 0) {
+                if (rosettad_write_byte(fd, ROSETTAD_RESP_MISS) < 0)
+                    goto done;
+                break;
+            }
+            int sr = rosettad_send_aot(fd, NULL, cached);
+            close(cached);
+            if (sr < 0)
+                goto done;
+            break;
+        }
+
+        case ROSETTAD_CMD_QUIT:
+            goto done;
+
+        default:
+            /* Unknown command: reply MISS to keep the wire balanced and
+             * hope rosetta recovers. A real cache miss / handshake noise
+             * landing in this branch would otherwise hang the protocol.
+             */
+            if (rosettad_write_byte(fd, ROSETTAD_RESP_MISS) < 0)
+                goto done;
+            break;
+        }
+    }
+
+done:
+    /* Release the handler end first, then drop the client-fd marker so
+     * rosettad_is_socket stops misclassifying a recycled fd number. The
+     * actual client fd was closed by the guest (or will be) -- this just
+     * retracts the bridge claim. The release store pairs with the acquire
+     * loads in rosettad_wait_for_idle, so a waiter that observes -1 also
+     * observes the handler fd already closed. The handler does not race
+     * with sys_socket starting another bridge: at most one rosetta bridge
+     * per elfuse process by design.
+     */
+    close(fd);
+    atomic_store_explicit(&rosettad_client_fd, -1, memory_order_release);
+    return NULL;
+}
+
+/* Claim the single bridge slot for client_fd and start the detached
+ * handler thread on handler_fd. Returns 0 on success; -1 if either fd is
+ * negative, a bridge is already active, or the thread cannot be created
+ * (in which case the slot is released again).
+ */
+int rosettad_start_handler(int handler_fd, int client_fd)
+{
+    if (handler_fd < 0 || client_fd < 0)
+        return -1;
+    /* Only one bridge per process is supported. Claim the slot via a single
+     * atomic compare-exchange so two threads cannot both observe -1 and then
+     * race to install their own client fd; the loser sees the winner's fd
+     * loaded into 'expected' and fails out cleanly.
+     */
+    int expected = -1;
+    if (!atomic_compare_exchange_strong_explicit(
+            &rosettad_client_fd, &expected, client_fd, memory_order_acq_rel,
+            memory_order_relaxed)) {
+        log_error("rosettad_start_handler: bridge already active on fd %d",
+                  expected);
+        return -1;
+    }
+
+    pthread_t thr;
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    int rc = pthread_create(&thr, &attr, rosettad_handler_thread,
+                            (void *) (intptr_t) handler_fd);
+    pthread_attr_destroy(&attr);
+    if (rc != 0) {
+        log_error("rosettad_start_handler: pthread_create failed: %s",
+                  strerror(rc));
+        atomic_store_explicit(&rosettad_client_fd, -1, memory_order_release);
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/core/rosetta.h b/src/core/rosetta.h
new file mode 100644
index 0000000..34b95f5
--- /dev/null
+++ b/src/core/rosetta.h
@@ -0,0 +1,207 @@
+/* x86_64-via-Apple-Rosetta translator setup for elfuse.
+ *
+ * Copyright 2026 elfuse contributors
+ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Two-phase API:
+ * - rosetta_prepare() runs before guest_build_page_tables(), loading Rosetta
+ *   binary into the primary buffer at a low GPA below interp_base, initializes
+ *   the TTBR1 kbuf, and appends a non-identity mem_region_t so the page-table
+ *   builder exposes the segments at their statically-linked high VA
+ *   (0x800000000000).
The primary-buffer placement is required because HVF on + * Apple Silicon caps Stage-2 hv_vm_map at the hardware-default IPA width (36 + * bits on M1), so a separate high-IPA mapping is not viable. + * - rosetta_finalize() runs after guest_build_page_tables(), installing TTBR0 + * kbuf user-VA alias, builds the vDSO, registers semantic regions for + * /proc/self/maps, pre-opens the x86_64 binary at fd 3, constructs + * binfmt_misc-style argv, and refreshes proc state. + * + * Bootstrap-side wiring (placement, kbuf, page-tables, fd 3, binfmt argv, TTBR0 + * kbuf alias, VZ probe ioctls, rosettad socket bridge and AOT cache) is in. The + * runtime still depends on the high-VA mmap body refactor for Rosetta's own + * fixed-address slab and JIT allocations. + */ + +#pragma once + +#include +#include +#include + +#include "core/elf.h" +#include "core/guest.h" + +/* Apple Rosetta Linux translator path. The OS ships the binary inside the + * platform image; users who have not run 'softwareupdate --install-rosetta' + * will not have this file and elfuse must refuse the load with a helpful + * error. + */ +#define ROSETTA_PATH "/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta" + +/* Path of Apple's standalone translator daemon. The 'elfuse rosettad + * translate' subcommand re-execs into this binary inside an aarch64-linux + * guest to materialise an AOT translation on cache miss. + */ +#define ROSETTAD_TRANSLATOR_PATH \ + "/Library/Apple/usr/libexec/oah/RosettaLinux/rosettad" + +/* Rosetta's Virtualization.framework probe ioctls. Rosetta issues these on + * an open fd very early at startup to verify that it is running inside a + * supported VZ environment. Without affirmative responses, rosetta prints + * "Rosetta is only intended to run on Apple Silicon ..." and exits. + * + * Reverse-engineered from the rosetta binary; values match what the + * Lima-on-VZ Linux VM observes via strace.
+ */ +#define ROSETTA_VZ_CHECK 0x80456125 /* Returns 69-byte signature */ +#define ROSETTA_VZ_CAPS 0x80806123 /* Returns 128-byte capability blob */ +#define ROSETTA_VZ_ACTIVATE 0x6124 /* Activate VZ mode (just returns 1) */ + +/* VZ_CAPS buffer layout */ +#define ROSETTA_CAPS_SIZE 128 +#define ROSETTA_CAPS_VZ_ENABLE 0 +#define ROSETTA_CAPS_SOCKET_PATH 1 +#define ROSETTA_CAPS_SOCKET_PATH_LEN 64 +#define ROSETTA_CAPS_BINARY_PATH 66 +#define ROSETTA_CAPS_BINARY_PATH_LEN 42 +#define ROSETTA_CAPS_VZ_SECONDARY 108 +#define ROSETTA_VZ_SIG_LEN 69 +#define ROSETTAD_SOCKET_PATH "/run/rosettad/rosetta.sock" + +/* Record the x86_64 binary path for both Rosetta consumers: + * - VZ_CAPS exposes a 42-byte inline path field to rosetta itself. When the + * original host path is longer, publish a short runtime alias instead of a + * truncated string. + * - The host-side translate subprocess needs the full original path. + * Subsequent calls overwrite the previous value (execve into a different + * x86_64 binary). + */ +void rosettad_set_binary_path(const char *path, bool take_ownership); +void rosettad_clear_binary_path(void); + +/* Snapshot the published paths into caller-supplied buffers under the + * setter's mutex. Returns the byte count written (excluding NUL). The + * lock keeps the VZ_CAPS reader (any vCPU) and the execve writer from + * racing on the static buffer contents. + * + * Caller buffers: + * rosettad_snapshot_binary_path - PATH_MAX wide for full host path + * rosettad_snapshot_caps_binary_path - >= ROSETTA_CAPS_BINARY_PATH_LEN + */ +size_t rosettad_snapshot_binary_path(char *out_buf, size_t out_size); +size_t rosettad_snapshot_caps_binary_path(char *out_buf, size_t out_size); + +/* rosettad wire protocol. + * + * Rosetta opens AF_UNIX SOCK_SEQPACKET and connects to a socket; macOS + * lacks SOCK_SEQPACKET for AF_UNIX so elfuse intercepts socket(SEQPACKET) + * with socketpair(SOCK_STREAM) and runs a handler thread on the other + * end. 
The thread implements the single-byte command protocol below. + * + * Wire sequence (rosetta is the client, handler is the daemon): + * '?' (HANDSHAKE): handler replies one byte HIT (0x01) when ready. + * 't' (TRANSLATE): rosetta sendmsg's the x86_64 binary fd via SCM_RIGHTS + * with up to 256 bytes of params. Handler computes SHA-256, checks + * the persistent AOT cache, on miss spawns the real rosettad to do + * the translation, then sends back {success_byte, 32-byte digest, + * AOT fd via SCM_RIGHTS in a 1-byte iov payload}. + * 'd' (DIGEST): rosetta sends a 32-byte SHA-256 fingerprint; handler + * looks up the persistent cache and replies HIT + fd or MISS. + * 'q' (QUIT): handler thread exits and closes its end of the socket. + */ +#define ROSETTAD_CMD_HANDSHAKE '?' +#define ROSETTAD_CMD_TRANSLATE 't' +#define ROSETTAD_CMD_DIGEST 'd' +#define ROSETTAD_CMD_QUIT 'q' + +#define ROSETTAD_RESP_HIT 0x01 +#define ROSETTAD_RESP_MISS 0x00 + +#define ROSETTAD_DIGEST_SIZE 32 +#define ROSETTAD_DIGEST_HEX_LEN (ROSETTAD_DIGEST_SIZE * 2 + 1) + +/* Persistent AOT cache directory, relative to $HOME. Real rosettad uses + * ~/.cache/rosetta/ for its own .flu cache; elfuse's intercept runs in + * parallel under a separate subdirectory to keep the two from colliding. + */ +#define ROSETTAD_CACHE_SUBDIR ".cache/elfuse-rosettad" + +/* Spawn the rosettad handler thread on the elfuse-side end of the + * socketpair. handler_fd is the host fd the thread reads/writes; client_fd + * is the rosetta-visible side, recorded so rosettad_is_socket can later + * identify it (sys_recvmsg / sys_sendmsg paths use this to decide whether + * to take the rosettad-aware code branch). + * + * Returns 0 on success; -1 on a negative fd, an already-active bridge, or + * pthread_create failure. The thread runs detached and exits when the + * client closes its fd (read returns 0) or sends ROSETTAD_CMD_QUIT.
+ */ +int rosettad_start_handler(int handler_fd, int client_fd); + +/* True when host_fd is the rosetta-visible end of a socketpair installed + * by rosettad_start_handler. Used by sys_connect to short-circuit the + * connect (the socketpair is pre-wired) and by sendmsg/recvmsg to pick + * the rosettad-aware code paths. + */ +bool rosettad_is_socket(int host_fd); + +/* Block (with a short poll loop) until the rosettad bridge handler thread + * has cleared its process-global client-fd marker, OR the timeout elapses. + * Used by sys_execve before installing a fresh bridge so a stale handler + * winding down does not collide with the new rosettad_start_handler CAS. + * + * Returns true if the bridge is idle on return, false if the timeout + * expired with the bridge still claimed. + */ +bool rosettad_wait_for_idle(unsigned int max_ms); + +/* Rosetta is statically linked at the 128 TiB mark. Its segments live at a low + * GPA in the primary buffer; guest page tables map this high VA range to that + * GPA through a non-identity mem_region_t (see rosetta_prepare). + */ +#define ROSETTA_VA_BASE_DEFAULT 0x800000000000ULL + +typedef struct { + elf_info_t rosetta_info; + uint64_t entry_point; /* High-VA entry from rosetta ELF */ +} rosetta_result_t; + +/* First-pass rosetta setup, runs before guest_build_page_tables(): parse + * the rosetta binary, place its segments in the primary buffer (or reload + * into the existing placement on execve), initialise the TTBR1 kbuf, and + * append page-table regions for the builder. A single non-identity + * mem_region_t covers the rosetta image, mapping its statically-linked high + * VA to the chosen low GPA via mem_region_t.va_base. + * + * The caller's regions array and *nregions cursor are updated. Returns 0 on + * success, -1 on any failure; on failure g->rosetta_* state is left in the + * configuration it had on entry (so a retry can succeed).
+ */ +int rosetta_prepare(guest_t *g, + const char *binary_path, + mem_region_t *regions, + int *nregions, + int max_regions, + bool verbose, + rosetta_result_t *result); + +/* Second-pass rosetta setup, runs after guest_build_page_tables(): install + * the TTBR0 user-VA alias for the kbuf, pre-open the x86_64 binary at fd 3, + * build the binfmt_misc argv ([ROSETTA_PATH, binary, original argv[1..]]) + * for build_linux_stack to consume, and refresh proc state. The remaining + * runtime blocker after this stage is high-VA mmap support for Rosetta's + * own internal fixed-address allocations. + */ +int rosetta_finalize(guest_t *g, + hv_vcpu_t vcpu, + const char *binary_host_path, + bool binary_host_path_temp, + const char *binary_guest_path, + int guest_argc, + const char **guest_argv, + const rosetta_result_t *rr, + bool verbose, + int *out_argc, + const char ***out_argv, + uint64_t *out_vdso_addr); diff --git a/src/hvutil.h b/src/hvutil.h index 6a211d1..02995fe 100644 --- a/src/hvutil.h +++ b/src/hvutil.h @@ -65,8 +65,13 @@ /* TCR_EL1. * 4KiB granule, 48-bit VA, EPD1=1 (TTBR1 walks disabled). * Used by main.c (initial setup) and syscall/exec.c (exec re-init). + * + * The KBUF variant clears EPD1 (TTBR1 walks enabled) and sets TBI1=1 so + * rosetta's TaggedPointer masking still resolves the kbuf window at the + * bits-63-set VA range. Selected at bootstrap when g->is_rosetta is set. */ #define TCR_EL1_VALUE 0x25B5903510ULL +#define TCR_EL1_VALUE_KBUF 0x65B5103510ULL /* vCPU register helpers. 
* diff --git a/src/main.c b/src/main.c index cebf591..30eb23d 100644 --- a/src/main.c +++ b/src/main.c @@ -29,11 +29,14 @@ #include "core/bootstrap.h" #include "core/guest.h" +#include "core/rosetta.h" #include "core/sysroot.h" #include "runtime/forkipc.h" #include "runtime/proctitle.h" +#include "syscall/fuse.h" +#include "syscall/path.h" #include "syscall/proc.h" #include "debug/gdbstub.h" @@ -50,6 +53,42 @@ static int parse_int_arg(const char *s, int min, int max, int *out) return 0; } +static int resolve_guest_elf_host_path(const char *elf_guest_path, + char *elf_host_path, + size_t elf_host_path_sz, + bool *elf_host_temp) +{ + path_translation_t tx; + if (!elf_guest_path || !elf_host_path || elf_host_path_sz == 0 || + !elf_host_temp) { + errno = EINVAL; + return -1; + } + + *elf_host_temp = false; + if (path_translate_at(LINUX_AT_FDCWD, elf_guest_path, PATH_TR_NONE, &tx) < + 0) + return -1; + + if (tx.fuse_path) { + int rc = fuse_materialize_path(tx.intercept_path, elf_host_path, + elf_host_path_sz); + if (rc < 0) { + errno = -rc; + return -1; + } + *elf_host_temp = true; + return 0; + } + + size_t len = str_copy_trunc(elf_host_path, tx.host_path, elf_host_path_sz); + if (len >= elf_host_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + static void free_guest_argv(const char **guest_argv, int guest_argc) { if (!guest_argv) @@ -70,6 +109,7 @@ static void cleanup_main_resources(guest_t *g, { if (guest_initialized) guest_destroy(g); + rosettad_clear_binary_path(); if (host_cwd && host_cwd[0] != '\0' && chdir(host_cwd) < 0) (void) chdir("/"); sysroot_cleanup_mount(sysroot_mount); @@ -120,6 +160,12 @@ int main(int argc, char **argv) log_init(); bool verbose = false; + /* x86_64-via-Rosetta is on by default; --no-rosetta or + * ELFUSE_NO_ROSETTA=1 disables it. Architecture is auto-detected from + * the ELF header in guest_bootstrap_prepare; the access() probe in + * rosetta_prepare surfaces an install hint if Rosetta is not present. 
+ */ + bool rosetta_enabled = true; int timeout_sec = 10, fork_child_fd = -1, vfork_notify_fd = -1; const char *sysroot = NULL; const char *create_sysroot = NULL; @@ -127,6 +173,25 @@ int main(int argc, char **argv) bool gdb_stop_on_entry = false; int arg_start = 1; + /* 'elfuse rosettad translate ' runs the real Apple rosettad + * binary inside an elfuse guest to materialise an AOT translation. The + * rosettad bridge in src/core/rosetta.c invokes this on cache miss; the + * subcommand rewrites argv so the rest of main proceeds as a normal + * aarch64-linux execution of rosettad with the requested arguments. + */ + if (argc >= 5 && !strcmp(argv[1], "rosettad") && + !strcmp(argv[2], "translate")) { + static char *rewritten[6]; + rewritten[0] = argv[0]; + rewritten[1] = (char *) ROSETTAD_TRANSLATOR_PATH; + rewritten[2] = (char *) "translate"; + rewritten[3] = argv[3]; + rewritten[4] = argv[4]; + rewritten[5] = NULL; + argv = rewritten; + argc = 5; + } + /* --help and --version do not require an ELF path. 
*/ if (argc > 1) { if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-V")) { @@ -137,6 +202,7 @@ int main(int argc, char **argv) printf( "usage: elfuse [--verbose] [--timeout N] [--sysroot PATH]\n" " [--create-sysroot PATH]\n" + " [--no-rosetta]\n" " [--gdb PORT] [--gdb-stop-on-entry]\n" " [args...]\n" "\n" @@ -150,6 +216,9 @@ int main(int argc, char **argv) "PATH first\n" " --create-sysroot PATH Provision and use a case-sensitive " "APFS sparsebundle mounted at PATH\n" + " --no-rosetta Disable x86_64-via-Rosetta " + "(architecture is auto-detected from the ELF header; " + "on by default)\n" " --gdb PORT Listen for GDB Remote Serial " "Protocol on PORT\n" " --gdb-stop-on-entry Halt before the first guest " @@ -199,6 +268,9 @@ int main(int argc, char **argv) arg_start + 1 < argc) { create_sysroot = argv[arg_start + 1]; arg_start += 2; + } else if (!strcmp(argv[arg_start], "--no-rosetta")) { + rosetta_enabled = false; + arg_start++; } else if (!strcmp(argv[arg_start], "--gdb") && arg_start + 1 < argc) { if (parse_int_arg(argv[arg_start + 1], 1, 65535, &gdb_port) < 0) { log_error("invalid GDB port: %s", argv[arg_start + 1]); @@ -217,7 +289,8 @@ int main(int argc, char **argv) log_error("unknown option: %s", argv[arg_start]); log_error( "usage: elfuse [--verbose] [--timeout N] " - "[--sysroot PATH] [--create-sysroot PATH] [--gdb PORT] " + "[--sysroot PATH] [--create-sysroot PATH] [--no-rosetta] " + "[--gdb PORT] " "[--gdb-stop-on-entry] [args...]"); return 1; } @@ -229,6 +302,19 @@ int main(int argc, char **argv) return 1; } + /* ELFUSE_NO_ROSETTA=1 mirrors --no-rosetta for environments where + * passing flags is awkward (test harnesses, wrapper scripts). Unset or + * any other value leaves the default on. Commit the result before the + * --fork-child early-return so helper processes inherit the parent's + * opt-out semantics exactly. 
+ */ + if (rosetta_enabled) { + const char *no_rosetta_env = getenv("ELFUSE_NO_ROSETTA"); + if (no_rosetta_env && strcmp(no_rosetta_env, "1") == 0) + rosetta_enabled = false; + } + proc_set_rosetta_enabled(rosetta_enabled); + /* Fork-child mode: receive VM state over IPC and run */ if (fork_child_fd >= 0) return fork_child_main(fork_child_fd, vfork_notify_fd, verbose, @@ -237,7 +323,8 @@ int main(int argc, char **argv) if (arg_start >= argc) { log_error( "usage: elfuse [--verbose] [--timeout N] " - "[--sysroot PATH] [--create-sysroot PATH] [args...]"); + "[--sysroot PATH] [--create-sysroot PATH] [--no-rosetta] " + " [args...]"); return 1; } @@ -271,6 +358,8 @@ int main(int argc, char **argv) bool guest_initialized = false; sysroot_mount_t sysroot_mount; char host_cwd[LINUX_PATH_MAX]; + char elf_host_path[LINUX_PATH_MAX]; + bool elf_host_temp = false; bool have_host_cwd = (getcwd(host_cwd, sizeof(host_cwd)) != NULL); memset(&sysroot_mount, 0, sizeof(sysroot_mount)); if (!elf_path || (have_sysroot && !sysroot_path) || !guest_argv) { @@ -320,17 +409,54 @@ int main(int argc, char **argv) return 1; } + proc_set_sysroot(sysroot); + if (resolve_guest_elf_host_path(elf_path, elf_host_path, + sizeof(elf_host_path), + &elf_host_temp) < 0) { + log_error("failed to resolve ELF path %s: %s", elf_path, + strerror(errno)); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + + if (gdb_port > 0) { + elf_info_t probe_info; + if (guest_bootstrap_probe_elf(elf_host_path, &probe_info) == 0 && + probe_info.e_machine == EM_X86_64) { + log_error( + "--gdb is not supported for x86_64 guests; the current stub " + "only exposes the translated aarch64 view"); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + } + guest_bootstrap_t boot; extern char **environ; - if (guest_bootstrap_prepare(&g, elf_path, sysroot, guest_argc, guest_argv, - environ, shim_bin, shim_bin_len, verbose, + if (guest_bootstrap_prepare(&g, elf_host_path, elf_host_temp, elf_path, + sysroot, guest_argc, guest_argv, environ, + shim_bin, shim_bin_len, verbose, &guest_initialized, &boot) < 0) { cleanup_main_resources(&g, guest_initialized, &sysroot_mount, have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); return 1; } + if (elf_host_temp && !g.is_rosetta) { + unlink(elf_host_path); + elf_host_temp = false; + } if (have_sysroot) { bool case_sensitive = true; diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index 4b45c61..8386dca 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,7 +18,11 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -#define IPC_VERSION 9 /* v9: preserve elf_load_min */ +/* Bumped to 10 when the rosetta placement / kbuf / ttbr1 tuple was added so + * a rosetta-aware child rejects an older parent's header instead of trying + * to interpret unknown trailing fields. + */ +#define IPC_VERSION 10 typedef struct { uint32_t magic; @@ -39,12 +43,34 @@ typedef struct { uint32_t _pad; uint64_t absock_namespace_id; int64_t sid, pgid; + /* Rosetta placement fields. All zero for aarch64 guests; populated when + * the parent is_rosetta. The child rebuilds the TTBR1 kbuf tree from the + * PT pool that came across in the memory transfer; rosetta_guest_base / + * va_base / size pin the segments at the same primary-buffer location so + * the non-identity page-table mapping remains coherent across the fork. 
+ */ + uint32_t is_rosetta; + uint32_t _rosetta_pad; + uint64_t rosetta_guest_base; + uint64_t rosetta_va_base; + uint64_t rosetta_size; + uint64_t rosetta_entry; + uint64_t kbuf_gpa; + uint64_t ttbr1; } ipc_header_t; typedef struct { uint64_t elr_el1, sp_el0; uint64_t spsr_el1, vbar_el1; uint64_t ttbr0_el1; + /* TTBR1_EL1 is zero for aarch64 guests and carries the rosetta kbuf + * page-table root for is_rosetta guests. The parent captures the live + * sysreg so a forked child resumes with the same TTBR1 the parent had + * after bootstrap_create_vcpu set it; without this the child comes up + * with TTBR1=0 even though TCR_EL1.EPD1 is cleared, and the first + * kernel-VA access faults. + */ + uint64_t ttbr1_el1; uint64_t sctlr_el1, tcr_el1, mair_el1, cpacr_el1, tpidr_el0, sp_el1; uint64_t x[31]; vcpu_simd_state_t simd_state; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index dc32bda..52847cb 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -121,9 +121,12 @@ int fork_child_main(int ipc_fd, * (size - 8 GiB) and interp_base (size - 4 GiB) plus the 4 MiB infra * reserve below it. 8 GiB satisfies all three with margin. * Upper bound: guest_size must fit in the negotiated IPA width. - * IPA bits: 36 (Apple M2) and 40 (M3+) are the supported widths. + * IPA bits: 36 (M1/M2) and 40 (M3+) for native aarch64; 48 for + * Rosetta guests, which need the wider Stage-2 width for high VAs + * (image at 128 TiB) even though their primary slab stays under + * 40-bit. Reject anything outside [36, 48]. 
*/ - if (hdr.ipa_bits < 36 || hdr.ipa_bits > 40) { + if (hdr.ipa_bits < 36 || hdr.ipa_bits > 48) { log_error("fork-child: invalid ipa_bits %u", (unsigned) hdr.ipa_bits); close(ipc_fd); return 1; @@ -210,6 +213,22 @@ int fork_child_main(int ipc_fd, g.ttbr0 = hdr.ttbr0; g.mmap_rx_next = hdr.mmap_rx_next; g.mmap_rx_end = hdr.mmap_rx_end; + /* Restore rosetta placement so the non-identity page-table entries that + * came across in the memory transfer continue to resolve. ttbr1 points + * at the L0 page the parent's PT pool emitted; that page sits inside + * the primary buffer and is copied by the region transfer below, so the + * child can reuse it without rebuilding the tree. + */ + g.is_rosetta = (hdr.is_rosetta != 0); + proc_set_rosetta_active(g.is_rosetta); + g.rosetta_guest_base = hdr.rosetta_guest_base; + g.rosetta_va_base = hdr.rosetta_va_base; + g.rosetta_size = hdr.rosetta_size; + g.rosetta_entry = hdr.rosetta_entry; + g.kbuf_gpa = hdr.kbuf_gpa; + g.ttbr1 = hdr.ttbr1; + if (g.is_rosetta && g.kbuf_gpa) + g.kbuf_base = (uint8_t *) g.host_base + g.kbuf_gpa; /* Register state is the fork return frame captured from the parent vCPU. */ ipc_registers_t regs; @@ -274,6 +293,7 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, regs.mair_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, regs.tcr_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, regs.ttbr0_el1)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, regs.ttbr1_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, regs.cpacr_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, regs.sp_el0)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, regs.sp_el1)); @@ -1056,6 +1076,15 @@ int64_t sys_clone(hv_vcpu_t vcpu, verbose); } + /* Rosetta fork takes the helper-process IPC path. 
The CoW shm fast-path + * is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and + * the parent's MAP_SHARED mapping cannot be safely remapped under the + * running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes + * ride along as primary-buffer used regions; the child restores + * TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base + * from kbuf_gpa. + */ + /* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like * clone (CLONE_VM|CLONE_VFORK|SIGCHLD) */ @@ -1117,6 +1146,13 @@ int64_t sys_clone(hv_vcpu_t vcpu, child_argv[ci++] = self_path; if (verbose) child_argv[ci++] = "--verbose"; + /* Rosetta is on by default; only propagate the opt-out flag when the + * parent explicitly disabled it. The child re-reads ELFUSE_NO_ROSETTA + * from the environment too, so an env-based opt-out is preserved + * across fork without an explicit argv entry. + */ + if (!proc_rosetta_enabled()) + child_argv[ci++] = "--no-rosetta"; child_argv[ci++] = "--fork-child"; child_argv[ci++] = fd_str; if (is_vfork) { @@ -1207,9 +1243,13 @@ int64_t sys_clone(hv_vcpu_t vcpu, /* Determine if elfuse can use the CoW (shm) fast path. * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the * shm fd to the child. Otherwise fall back to region-by-region copy. + * + * Rosetta guests are excluded from CoW even when shm-backed: rosetta's + * JIT state (TLS slabs, code caches, indirect-call tables, block lists) + * is process-local and corrupts when COW-shared. The legacy region-copy + * path preserves the parent's JIT state independently per child. */ - /* Use CoW fork when the guest has file-backed shared memory. */ - bool use_shm = (g->shm_fd >= 0); + bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta; /* elfuse does not remap the parent to MAP_PRIVATE here. The parent * stays on MAP_SHARED; its vCPU continues writing to the shared file. 
@@ -1260,6 +1300,13 @@ int64_t sys_clone(hv_vcpu_t vcpu, .absock_namespace_id = absock_get_namespace_id(), .sid = proc_get_sid(), .pgid = proc_get_pgid(), + .is_rosetta = g->is_rosetta ? 1 : 0, + .rosetta_guest_base = g->rosetta_guest_base, + .rosetta_va_base = g->rosetta_va_base, + .rosetta_size = g->rosetta_size, + .rosetta_entry = g->rosetta_entry, + .kbuf_gpa = g->kbuf_gpa, + .ttbr1 = g->ttbr1, }; if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) { log_error("clone: failed to send header"); @@ -1320,6 +1367,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, regs.spsr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1); regs.vbar_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_VBAR_EL1); regs.ttbr0_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TTBR0_EL1); + regs.ttbr1_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TTBR1_EL1); regs.sctlr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SCTLR_EL1); regs.tcr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TCR_EL1); regs.mair_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_MAIR_EL1); diff --git a/src/runtime/futex.c b/src/runtime/futex.c index a82192f..0d02d6b 100644 --- a/src/runtime/futex.c +++ b/src/runtime/futex.c @@ -1713,7 +1713,14 @@ void robust_list_walk(guest_t *g, thread_entry_t *t) futex_gva = list_ptr + (uint64_t) futex_offset; else futex_gva = list_ptr - (uint64_t) (-futex_offset); - if (futex_gva >= g->ipa_base + g->guest_size || + /* Canonical user-VA range check (bits 47:0). Anything with bit 63 + * set is kernel-VA territory and is never a valid futex address. + * The previous primary-buffer-only check (futex_gva < ipa_base + + * guest_size) silently dropped rosetta's high-VA futexes; the + * subsequent guest_read_small / guest_write_small calls do the + * actual mapping check via the page-table walker. + */ + if (futex_gva > 0x0000FFFFFFFFFFFFULL || !futex_uaddr_is_aligned(futex_gva)) { /* Out of range or unaligned: skip. 
Linux's unaligned_p() rejects * these; emulating the same avoids partial cross-page writes @@ -1765,7 +1772,11 @@ void robust_list_walk(guest_t *g, thread_entry_t *t) futex_gva = pending + (uint64_t) futex_offset; else futex_gva = pending - (uint64_t) (-futex_offset); - if (futex_gva >= g->ipa_base + g->guest_size || + /* Canonical user-VA + alignment only; guest_read_small below is the + * actual reachability test, so rosetta high-VA robust futexes are + * not silently skipped (was: futex_gva >= ipa_base + guest_size). + */ + if (futex_gva > 0x0000FFFFFFFFFFFFULL || !futex_uaddr_is_aligned(futex_gva)) return; uint32_t futex_val; diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 86d2a07..3acf491 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -45,6 +45,7 @@ #include "debug/log.h" #include "runtime/procemu.h" +#include "core/rosetta.h" #include "runtime/thread.h" #include "syscall/abi.h" @@ -1560,8 +1561,15 @@ int proc_intercept_open(const guest_t *g, /* /proc/self/exe -> open the actual ELF binary. * Unlike readlinkat (which returns the path string), openat needs to * return an actual file descriptor to the binary. + * Under rosetta, the binfmt_misc convention treats rosetta as the + * interpreter visible to the guest: rosetta opens /proc/self/fd/X + * via /proc/self/exe to identify itself and then issues the VZ + * ioctls on that descriptor. Return ROSETTA_PATH so the VZ ioctl + * gate (rosetta_ioctl_target_fd) recognises the fd. */ if (!strcmp(path, "/proc/self/exe")) { + if (g && g->is_rosetta) + return open(ROSETTA_PATH, O_RDONLY); char exe[LINUX_PATH_MAX]; if (!proc_elf_path_snapshot(exe, sizeof(exe))) { errno = ENOENT; @@ -2591,6 +2599,17 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) * abstraction the rest of the path layer presents. */ if (!strcmp(path, "/proc/self/exe")) { + /* Under rosetta, readlink("/proc/self/exe") points at the rosetta + * translator (the binfmt_misc interpreter). 
Matches the behavior + * Linux exposes when binfmt_misc dispatch is active. + */ + if (proc_rosetta_active()) { + size_t len = strlen(ROSETTA_PATH); + if (len > bufsiz) + len = bufsiz; + memcpy(buf, ROSETTA_PATH, len); + return (int) len; + } char exe_buf[LINUX_PATH_MAX]; if (!proc_elf_path_snapshot(exe_buf, sizeof(exe_buf))) { errno = ENOENT; diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 8716cec..0104058 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -672,6 +672,12 @@ enum { SOCK_OPT_TCP_KEEPINTVL, SOCK_OPT_IPV6_V6ONLY, SOCK_OPT_PASSCRED, + /* IP_MTU_DISCOVER value stored verbatim so getsockopt round-trips the + * Linux PMTUD mode the guest set. The host accepts the value but does + * not honour every Linux mode; see sys_setsockopt for the IP_DONTFRAG + * translation for the modes macOS supports. + */ + SOCK_OPT_IP_MTU_DISCOVER, SOCK_OPT_COUNT }; diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 43a8097..cecfcb2 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -22,7 +22,9 @@ #include "hvutil.h" #include "utils.h" +#include "core/bootstrap.h" #include "core/elf.h" +#include "core/rosetta.h" #include "core/stack.h" #include "core/vdso.h" @@ -41,6 +43,100 @@ * entry has been removed from the shared fd table. */ +/* Force HVF to commit the sysreg/GPR writes that sys_execve performs after + * a guest_reset before vcpu_run resumes. HVF defers writes until the next + * register-touch on the owning thread, and a stale read here is harmless. + * Use the HV_CHECK-wrapped accessors so a real HVF error (HV_BUSY, + * HV_ERROR) past the point of no return aborts cleanly with a diagnostic + * instead of silently resuming with undefined register state. 
+ */ +static void exec_sync_vcpu_regs(hv_vcpu_t vcpu) +{ + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_TTBR0_EL1); + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_TCR_EL1); + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_TTBR1_EL1); + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1); + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL0); + (void) vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1); + (void) vcpu_get_reg(vcpu, HV_REG_X8); +} + +/* Release the buffers and temporary host-side files that sys_execve allocates + * before crossing the point of no return. Used by both the Rosetta and the + * aarch64 success paths. + */ +static void exec_cleanup_inputs(char *argv_buf, + char *envp_buf, + const char *path_host_buf, + bool path_host_temp, + const char *interp_host_buf, + bool interp_host_temp) +{ + if (path_host_temp) + unlink(path_host_buf); + if (interp_host_temp) + unlink(interp_host_buf); + free(argv_buf); + free(envp_buf); +} + +static int exec_resolve_guest_host_path(const char *guest_path, + char *host_path, + size_t host_path_sz, + bool *host_path_temp) +{ + path_translation_t tx; + if (!guest_path || !host_path || host_path_sz == 0 || !host_path_temp) { + errno = EINVAL; + return -1; + } + + *host_path_temp = false; + if (path_translate_at(LINUX_AT_FDCWD, guest_path, PATH_TR_NONE, &tx) < 0) + return -1; + if (tx.fuse_path) { + int rc = + fuse_materialize_path(tx.intercept_path, host_path, host_path_sz); + if (rc < 0) { + errno = -rc; + return -1; + } + *host_path_temp = true; + return 0; + } + + size_t len = str_copy_trunc(host_path, tx.host_path, host_path_sz); + if (len >= host_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +static int exec_resolve_interp_host_path(const char *sysroot, + const char *interp_guest_path, + char *interp_host_path, + size_t interp_host_path_sz, + bool *interp_host_temp) +{ + char interp_candidate[LINUX_PATH_MAX]; + elf_resolve_interp(sysroot, interp_guest_path, interp_candidate, + sizeof(interp_candidate)); + if 
(strcmp(interp_candidate, interp_guest_path) != 0) { + size_t len = str_copy_trunc(interp_host_path, interp_candidate, + interp_host_path_sz); + if (len >= interp_host_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + *interp_host_temp = false; + return 0; + } + + return exec_resolve_guest_host_path(interp_guest_path, interp_host_path, + interp_host_path_sz, interp_host_temp); +} + /* Read a NULL-terminated pointer array from guest memory. * Each pointer in the array is a 64-bit GVA pointing to a string. * Returns the count of entries (excluding the NULL terminator), @@ -346,11 +442,23 @@ int64_t sys_execve(hv_vcpu_t vcpu, * unrecoverable, matching the Linux kernel's behavior (SIGKILL). */ - /* Reject x86_64 binaries (only aarch64 is supported) */ + /* x86_64 targets dispatch through guest_bootstrap_rosetta_post_reset + * once the point-of-no-return work below clears guest state. Reject + * here only when Rosetta is disabled via --no-rosetta or + * ELFUSE_NO_ROSETTA=1; otherwise mark the transition and skip the + * aarch64-specific ELF/interp setup below. + */ + bool target_is_rosetta = false; if (elf_info.e_machine == EM_X86_64) { - log_error("execve: x86_64 binaries not supported: %s", path); - err = -LINUX_ENOEXEC; - goto fail; + if (!proc_rosetta_enabled()) { + log_error( + "execve: x86_64 ELF rejected by --no-rosetta " + "(or ELFUSE_NO_ROSETTA=1): %s", + path); + err = -LINUX_ENOEXEC; + goto fail; + } + target_is_rosetta = true; } /* Compute load base once (used for size check and later mapping). @@ -378,14 +486,32 @@ int64_t sys_execve(hv_vcpu_t vcpu, elf_info_t interp_info; memset(&interp_info, 0, sizeof(interp_info)); char interp_resolved[LINUX_PATH_MAX]; + char interp_display_path[LINUX_PATH_MAX]; interp_resolved[0] = '\0'; + interp_display_path[0] = '\0'; - if (elf_info.interp_path[0] != '\0') { + /* x86_64 targets do not pre-load their PT_INTERP. 
Rosetta is statically + * linked and loads the target binary (and any guest-side dynamic linker) + * itself via fd 3, so the aarch64-only interpreter pre-load below is + * skipped for rosetta exec. + */ + if (!target_is_rosetta && elf_info.interp_path[0] != '\0') { char sysroot_snap[LINUX_PATH_MAX]; bool have_sr = proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap)); - elf_resolve_interp(have_sr ? sysroot_snap : NULL, elf_info.interp_path, - interp_resolved, sizeof(interp_resolved)); + if (exec_resolve_interp_host_path(have_sr ? sysroot_snap : NULL, + elf_info.interp_path, interp_resolved, + sizeof(interp_resolved), + &interp_host_temp) < 0) { + log_error("execve: failed to resolve interpreter: %s", + elf_info.interp_path); + err = -LINUX_ENOEXEC; + goto fail; + } + str_copy_trunc( + interp_display_path, + interp_host_temp ? elf_info.interp_path : interp_resolved, + sizeof(interp_display_path)); log_debug("execve: pre-validating interpreter: %s", interp_resolved); @@ -409,12 +535,8 @@ int64_t sys_execve(hv_vcpu_t vcpu, */ if (0) { fail: - if (path_host_temp) - unlink(path_host_buf); - if (interp_host_temp) - unlink(interp_host_buf); - free(argv_buf); - free(envp_buf); + exec_cleanup_inputs(argv_buf, envp_buf, path_host_buf, path_host_temp, + interp_host_buf, interp_host_temp); return err; } @@ -508,6 +630,24 @@ int64_t sys_execve(hv_vcpu_t vcpu, * kernel exec failure after its point of no return. */ fork_notify_vfork_exec(); + /* Only clear rosetta state when leaving rosetta. For rosetta-to-rosetta + * exec the placement (rosetta_guest_base, rosetta_va_base, kbuf_gpa, + * ttbr1) must survive guest_reset so guest_bootstrap_rosetta_post_reset + * hits rosetta_prepare's re-entry branch and reuses the existing GPA + * instead of picking a fresh one. Keep proc_rosetta_active in sync so + * /proc/self/exe readlink reports the right path. 
+ */ + if (g->is_rosetta && !target_is_rosetta) { + rosettad_clear_binary_path(); + guest_clear_rosetta_state(g); + proc_set_rosetta_active(false); + } else if (!g->is_rosetta && target_is_rosetta) { + /* aarch64 -> rosetta: enter rosetta mode fresh. guest_clear was + * already a no-op in this branch since the parent had no rosetta + * state to clear. */ + g->is_rosetta = true; + proc_set_rosetta_active(true); + } guest_reset(g); /* The replacement image must not inherit process-wide shutdown requests @@ -531,6 +671,94 @@ int64_t sys_execve(hv_vcpu_t vcpu, memcpy((uint8_t *) g->host_base + g->shim_base, shim_ptr, shim_size); } + /* x86_64 re-bootstrap branch: hand off the post-reset work to the + * Rosetta-aware helper, then write vCPU sysregs for kernel-VA execution + * and return without touching the aarch64-specific block below. + */ + if (target_is_rosetta) { + /* Drain the previous rosettad bridge before rosetta_finalize wires + * a fresh one. The detached handler thread only clears its global + * client-fd marker on its own EOF/exit. 1 s is enough headroom + * for a loaded host; a hung handler past that point will lose + * the start_handler CAS later, and the warning here marks the + * cause. Soft cap; the install may still succeed on timeout if + * the handler's CAS races us favourably. + */ + if (!rosettad_wait_for_idle(1000)) { + log_warn( + "execve: rosettad bridge did not drain within 1s; " + "rosetta_finalize CAS may lose the race"); + } + + /* path_host may point at path_host_buf (normal path) or at + * interp_host_buf (shebang resolution landed on a FUSE-backed + * x86_64 binary). Ownership of any materialized temp transfers + * to rosettad regardless of which buffer holds the path, so + * capture that temp path in one place and clear the matching + * temp flag here. 
exec_cleanup_inputs becomes a no-op for the + * transferred slot, and the post-PNR rollback below can unlink + * via owned_rosetta_temp without re-discriminating which buffer + * was selected. + */ + const char *owned_rosetta_temp = NULL; + if (path_host == path_host_buf && path_host_temp) { + owned_rosetta_temp = path_host_buf; + path_host_temp = false; + } else if (path_host == interp_host_buf && interp_host_temp) { + owned_rosetta_temp = interp_host_buf; + interp_host_temp = false; + } + + uint64_t r_entry = 0, r_sp = 0, r_ttbr0 = 0; + if (guest_bootstrap_rosetta_post_reset( + g, path_host, owned_rosetta_temp != NULL, path, argc, + (const char **) argv, envp, shim_size, false, &r_entry, &r_sp, + &r_ttbr0) < 0) { + /* Post-PNR fatal failure. The temp flag was cleared up front + * so exec_cleanup_inputs would be a no-op, and rosettad never + * reached its ownership-commit point on this failure path. + * Best-effort unlink so the materialized temp does not orphan + * in /tmp on a path the kernel parallels with SIGKILL. + */ + if (owned_rosetta_temp) + unlink(owned_rosetta_temp); + log_fatal( + "execve failed after point of no return: " + "rosetta re-bootstrap failed for %s", + path); + exit(128); + } + + /* I-cache for the (possibly re-mapped) rosetta segments has already + * been invalidated inside rosetta_prepare; only the shim needs an + * I-cache flush from here. 
+ */ + sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, + shim_size); + + uint64_t entry_ipa = guest_ipa(g, r_entry); + uint64_t sp_ipa = guest_ipa(g, r_sp); + + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, r_ttbr0); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, TCR_EL1_VALUE_KBUF); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, g->ttbr1); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, entry_ipa); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, 0x0); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, 0); + vcpu_zero_gprs(vcpu); + hv_vcpu_set_reg(vcpu, HV_REG_X8, 2); + tlbi_request_clear(); + + exec_sync_vcpu_regs(vcpu); + + log_debug("execve: rosetta target %s, entry=0x%llx sp=0x%llx", path, + (unsigned long long) entry_ipa, (unsigned long long) sp_ipa); + exec_cleanup_inputs(argv_buf, envp_buf, path_host_buf, path_host_temp, + interp_host_buf, interp_host_temp); + return SYSCALL_EXEC_HAPPENED; + } + /* Load the executable image that was validated before guest_reset(). */ if (elf_map_segments(&elf_info, path_host, g->host_base, g->guest_size, elf_load_base) < 0) { @@ -735,7 +963,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, interp_info.segments[i].memsz + interp_base, elf_pf_to_prot(interp_info.segments[i].flags), LINUX_MAP_PRIVATE, interp_info.segments[i].offset, - interp_resolved); + interp_display_path); } } /* Leave the lowest stack page unmapped so downward overflow faults before @@ -785,6 +1013,8 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Switch EL0 translation to the rebuilt page tables. */ hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, ttbr0); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, TCR_EL1_VALUE); + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, 0); /* The shim will ERET to this address after syscall dispatch returns. 
*/ hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, entry_ipa); @@ -816,28 +1046,13 @@ int64_t sys_execve(hv_vcpu_t vcpu, hv_vcpu_set_reg(vcpu, HV_REG_X8, 2); tlbi_request_clear(); - /* Readback forces HVF to commit sysreg/GPR writes before the run loop - * resumes the vCPU. - */ - { - uint64_t _sync; - hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, &_sync); - hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, &_sync); - hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SP_EL0, &_sync); - hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, &_sync); - hv_vcpu_get_reg(vcpu, HV_REG_X8, &_sync); - (void) _sync; - } + exec_sync_vcpu_regs(vcpu); log_debug("execve: loaded %s, entry=0x%llx sp=0x%llx", path_host, (unsigned long long) entry_ipa, (unsigned long long) sp_ipa); - if (path_host_temp) - unlink(path_host_buf); - if (interp_host_temp) - unlink(interp_host_buf); - free(argv_buf); - free(envp_buf); + exec_cleanup_inputs(argv_buf, envp_buf, path_host_buf, path_host_temp, + interp_host_buf, interp_host_temp); return SYSCALL_EXEC_HAPPENED; diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index 0a80a06..dc64d98 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -1970,6 +1970,28 @@ static void fuse_file_release(fuse_file_snap_t *snap) pthread_mutex_unlock(&fuse_lock); } +int fuse_materialize_fd(int fd, char *out_path, size_t outsz) +{ + fuse_file_snap_t snap; + int rc = fuse_file_acquire(fd, false, false, &snap); + if (rc < 0) + return rc; + + if (snap.path_only) { + char path[LINUX_PATH_MAX]; + str_copy_trunc(path, snap.file->path, sizeof(path)); + fuse_file_release(&snap); + return fuse_materialize_path(path, out_path, outsz); + } + + pthread_mutex_lock(&snap.session->lock); + rc = fuse_materialize_open_file_locked(snap.session, snap.nodeid, snap.fh, + LINUX_O_RDONLY, out_path, outsz); + pthread_mutex_unlock(&snap.session->lock); + fuse_file_release(&snap); + return rc; +} + /* Read up to count bytes from a FUSE-backed file or directory at offset. 
* Writes the daemon's reply into the guest buffer at buf_gva. Updates the * stream offset on success when advance_offset is true and the fd still diff --git a/src/syscall/fuse.h b/src/syscall/fuse.h index 0b5eaec..fd5e58b 100644 --- a/src/syscall/fuse.h +++ b/src/syscall/fuse.h @@ -30,6 +30,7 @@ bool fuse_path_matches_mount(const char *path); int fuse_stat_path(const char *path, struct stat *st, int at_flags); int fuse_access_path(const char *path, int mode, int flags); int fuse_materialize_path(const char *path, char *out_path, size_t outsz); +int fuse_materialize_fd(int fd, char *out_path, size_t outsz); int fuse_fstat_fd(int fd, struct stat *st); int64_t fuse_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); int64_t fuse_read_fd(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); diff --git a/src/syscall/io.c b/src/syscall/io.c index 748ff98..60f6f34 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,7 +26,10 @@ #include "utils.h" +#include "core/rosetta.h" +#include "hvutil.h" #include "runtime/procemu.h" +#include "runtime/thread.h" #include "syscall/abi.h" #include "syscall/fd.h" @@ -118,6 +122,98 @@ static int64_t io_return_zero(host_fd_ref_t *host_ref) return 0; } +static bool rosetta_ioctl_target_fd(guest_t *g, int host_fd) +{ + if (!g->is_rosetta) + return false; + + /* Rosetta opens /proc/self/exe (which under rosetta resolves to the + * rosetta translator, not elfuse) and issues the VZ probe ioctls on + * that descriptor. Match against ROSETTA_PATH so the gate triggers + * regardless of where elfuse itself lives on disk. + */ + char resolved[PATH_MAX]; + if (fcntl(host_fd, F_GETPATH, resolved) < 0) + return false; + if (strcmp(resolved, ROSETTA_PATH) != 0) + return false; + + /* Defense in depth: require the syscall to originate from inside the + * rosetta translator image. 
The /proc/self/exe redirection makes the + * launcher fd reachable to any code running under a rosetta-enabled + * VM, so without this check a guest-launched helper that opened + * /proc/self/exe could exercise the synthetic VZ probe path. Today + * the responses are public constants, but the gate guards against + * future synthetic responses that leak host state. ELR_EL1 carries + * the EL0 return PC captured at SVC entry on aarch64. + * + * Skip if the rosetta image bounds are not yet known (pre-finalize); + * the F_GETPATH match above is the only gate in that window, and + * rosetta_finalize publishes the bounds before issuing any ioctl. + */ + if (g->rosetta_va_base && g->rosetta_size) { + if (!current_thread) + return false; + uint64_t pc = vcpu_get_sysreg(current_thread->vcpu, HV_SYS_REG_ELR_EL1); + if (pc < g->rosetta_va_base || + pc - g->rosetta_va_base >= g->rosetta_size) + return false; + } + return true; +} + +/* Returns true if request matches one of the Rosetta VZ probe ioctls. */ +static bool rosetta_vz_request(uint64_t request) +{ + return request == ROSETTA_VZ_CHECK || request == ROSETTA_VZ_CAPS || + request == ROSETTA_VZ_ACTIVATE; +} + +/* Handle the Rosetta VZ probe ioctl trio. Writes synthetic responses to the + * guest buffer at arg and returns the value the guest sees (1 on success, + * negative Linux errno on a guest_write fault). Caller is responsible for + * dispatch gating (see rosetta_vz_request + rosetta_ioctl_target_fd). 
+ */ +static int64_t rosetta_vz_ioctl(guest_t *g, uint64_t request, uint64_t arg) +{ + switch (request) { + case ROSETTA_VZ_CHECK: { + static const char rosetta_sig[ROSETTA_VZ_SIG_LEN] = + "Our hard work\nby these words guarded\n" + "please don't steal\n\xc2\xa9 Apple Inc"; + if (guest_write(g, arg, rosetta_sig, sizeof(rosetta_sig)) < 0) + return -LINUX_EFAULT; + return 1; + } + case ROSETTA_VZ_CAPS: { + /* caps is zero-initialised: VZ_SECONDARY and the trailing NUL of any + * partially-copied binary path are already in place. + */ + uint8_t caps[ROSETTA_CAPS_SIZE] = {0}; + caps[ROSETTA_CAPS_VZ_ENABLE] = 1; + static const char fake_sock_path[] = ROSETTAD_SOCKET_PATH; + memcpy(&caps[ROSETTA_CAPS_SOCKET_PATH], fake_sock_path, + sizeof(fake_sock_path)); + /* Snapshot the caps binary path under the rosetta path lock so a + * concurrent execve cannot tear the string between length probe + * and copy. Inline buffer matches the cap exactly; the snapshot + * helper bounds the write itself. + */ + char bin[ROSETTA_CAPS_BINARY_PATH_LEN]; + size_t bin_n = rosettad_snapshot_caps_binary_path(bin, sizeof(bin)); + if (bin_n > 0) + memcpy(&caps[ROSETTA_CAPS_BINARY_PATH], bin, bin_n); + if (guest_write(g, arg, caps, sizeof(caps)) < 0) + return -LINUX_EFAULT; + return 1; + } + case ROSETTA_VZ_ACTIVATE: + return 1; + } + /* Caller gates dispatch; this is unreachable in practice. */ + return -LINUX_ENOTTY; +} + /* termios flag translation helpers. */ /* Linux aarch64 c_iflag bits (from asm-generic/termbits-common.h). @@ -1180,6 +1276,18 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) return err; int host_fd = host_ref.fd; + /* Rosetta's Virtualization.framework probe ioctls are issued on the + * /proc/self/exe launcher fd very early at startup. Gate on that actual + * host file rather than on every fd in a Rosetta guest: under rosetta + * /proc/self/exe opens ROSETTA_PATH, and rosetta_ioctl_target_fd + * matches the fd's F_GETPATH result against ROSETTA_PATH. 
+ */ + if (rosetta_vz_request(request) && rosetta_ioctl_target_fd(g, host_fd)) { + int64_t r = rosetta_vz_ioctl(g, request, arg); + host_fd_ref_close(&host_ref); + return r; + } + switch (request) { case LINUX_TIOCSPGRP: { /* Set foreground process group for controlling terminal. */ diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 5edba87..32620c2 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -216,6 +217,7 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) g->regions[i].end = boundary; g->regions[i + 1].offset += (boundary - g->regions[i + 1].start); + g->regions[i + 1].gpa_base += (boundary - g->regions[i + 1].start); g->regions[i + 1].start = boundary; if (g->regions[i + 1].backing_fd >= 0) { g->regions[i + 1].backing_fd = dup(g->regions[i + 1].backing_fd); @@ -347,6 +349,319 @@ static int prot_to_perms(int prot) return perms; } +static void *host_ptr_for_gpa(const guest_t *g, uint64_t gpa) +{ + if (gpa < g->guest_size) + return (uint8_t *) g->host_base + gpa; + + const guest_mapping_t *m = guest_find_mapping(g, gpa); + if (m) + return (uint8_t *) m->host_va + (gpa - m->gpa); + + const guest_overflow_t *o = guest_find_overflow(g, gpa); + if (o) + return (uint8_t *) o->host_base + (gpa - o->ipa_start); + + return NULL; +} + +static bool region_range_overlaps(const guest_t *g, + uint64_t start, + uint64_t end) +{ + int lo = 0, hi = g->nregions - 1, first = g->nregions; + while (lo <= hi) { + int mid = (lo + hi) / 2; + if (g->regions[mid].end > start) { + first = mid; + hi = mid - 1; + } else { + lo = mid + 1; + } + } + + return first < g->nregions && g->regions[first].start < end; +} + +static int64_t sys_mmap_fixed_high_va(guest_t *g, + uint64_t addr, + uint64_t length, + int prot, + int flags, + guest_fd_t fd, + uint64_t offset, + bool is_noreplace) +{ + int64_t ret = -LINUX_ENOMEM; + + if (!g->is_rosetta) + return -LINUX_ENOMEM; + + bool 
is_anon = (flags & LINUX_MAP_ANONYMOUS) != 0; + bool is_shared = (flags & LINUX_MAP_SHARED) != 0; + host_fd_ref_t backing_ref = {.fd = -1, .owned = false}; + int host_backing_fd = -1; + int track_backing_fd = -1; + bool close_host_backing_fd = false; + /* High-water mark of VA installed by the mapping loop; reachable from + * the fail label so the rollback knows what to invalidate. Must be + * initialized before any goto fail that runs before the loop. + */ + uint64_t va_installed_end = 0; + /* If a fresh block has been block-mapped (live RW/RX over the full + * 2 MiB) but has not yet had its L3 split-inherited entries zeroed, + * the rollback must clear the full block, not just [addr, addr+length). + * Tracks at most one in-flight fresh block at a time; UINT64_MAX means + * no in-flight fresh block needs full-scope rollback. + */ + uint64_t inflight_fresh_block_va = UINT64_MAX; + + if (!is_anon && is_shared) + return -LINUX_ENODEV; + + /* Reject wrap before reusing addr + length anywhere below. The caller + * page-rounds length, but addr is guest-supplied and a huge length + * against a high VA can still overflow. Also reject the case where + * addr + length is too close to UINT64_MAX for ALIGN_UP to round up + * the 2 MiB boundary without wrapping to 0 (which would make va_end + * smaller than va_start and underflow backing_span). + */ + if (length == 0 || addr > UINT64_MAX - length) + return -LINUX_ENOMEM; + if ((addr + length) > UINT64_MAX - (BLOCK_2MIB - 1)) + return -LINUX_ENOMEM; + + if (guest_kbuf_user_va_overlap(addr, length)) + return -LINUX_ENOMEM; + + if (region_range_overlaps(g, addr, addr + length)) { + if (is_noreplace) + return -LINUX_EEXIST; + /* High-VA MAP_FIXED replacement is still limited to fresh ranges. + * Replacing partially-overlapping non-identity mappings needs a more + * complete VA-aware rollback path than the low-VA slab code uses. 
+ */ + return -LINUX_ENOMEM; + } + + uint64_t va_start = ALIGN_DOWN(addr, BLOCK_2MIB); + uint64_t va_end = ALIGN_UP(addr + length, BLOCK_2MIB); + uint64_t backing_span = va_end - va_start; + uint64_t backing_gpa_start = ALIGN_UP( + (g->mmap_end > g->mmap_next) ? g->mmap_end : g->mmap_next, BLOCK_2MIB); + uint64_t backing_limit = + g->kbuf_gpa ? g->kbuf_gpa : (g->interp_base - INFRA_RESERVE); + if (backing_gpa_start >= backing_limit || + backing_span > backing_limit - backing_gpa_start) + return -LINUX_ENOMEM; + + if (!is_anon) { + if (fuse_fd_refuse_mmap(fd)) { + char materialized_path[PATH_MAX]; + int rc = fuse_materialize_fd(fd, materialized_path, + sizeof(materialized_path)); + if (rc < 0) + return rc; + host_backing_fd = open(materialized_path, O_RDONLY | O_CLOEXEC); + int saved_errno = errno; + unlink(materialized_path); + if (host_backing_fd < 0) { + errno = saved_errno; + return linux_errno(); + } + close_host_backing_fd = true; + } else { + if (host_fd_ref_open(fd, &backing_ref) < 0) + return -LINUX_EBADF; + host_backing_fd = backing_ref.fd; + } + track_backing_fd = dup(host_backing_fd); + if (track_backing_fd < 0) { + ret = -LINUX_ENOMEM; + goto fail; + } + if (prot != LINUX_PROT_NONE) { + char probe; + ssize_t nr; + do { + nr = pread(host_backing_fd, &probe, sizeof(probe), + (off_t) offset); + } while (nr < 0 && errno == EINTR); + if (nr < 0) { + ret = linux_errno(); + goto fail; + } + } + } + + int map_perms = + (prot == LINUX_PROT_NONE) ? MEM_PERM_RW : prot_to_perms(prot); + + /* Mapping loop installs PT state in block-sized steps. Any L1/L2 tables + * newly allocated during this call are left in place on rollback: they + * are zero descriptors after invalidation and harmless until reused by + * a later mmap. 
+ */ + va_installed_end = va_start; + + for (uint64_t va = va_start; va < va_end; va += BLOCK_2MIB) { + uint64_t gpa = backing_gpa_start + (va - va_start); + + void *host = host_ptr_for_gpa(g, gpa); + if (!host) + goto fail; + memset(host, 0, BLOCK_2MIB); + + /* Detect freshness BEFORE guest_map_va_range so the decision is not + * confused by a prior high-VA mmap into the same 2 MiB block. A + * fresh block needs its split-inherited L3 entries zeroed so gap + * pages do not silently inherit block-level perms; a pre-existing + * block must be left alone so earlier mappings into the same block + * survive. + */ + bool fresh_block = !guest_va_block_mapped(g, va); + + if (guest_map_va_range(g, va, va + BLOCK_2MIB, gpa, map_perms) < 0) + goto fail; + va_installed_end = va + BLOCK_2MIB; + + /* Fresh blocks are live with full-2 MiB block-level perms from + * here until guest_invalidate_ptes zeros the split-inherited L3 + * entries. If split or invalidate fails in between, the rollback + * must scrub the entire block; record it for the fail path. + */ + if (fresh_block) + inflight_fresh_block_va = va; + + /* Always split so guest_install_va_pages can write 4 KiB L3 PTEs + * for the actual mapped range; pre-existing tables make split a + * no-op. + */ + if (guest_split_block(g, va) < 0) + goto fail; + + if (fresh_block) { + if (guest_invalidate_ptes(g, va, va + BLOCK_2MIB) < 0) + goto fail; + /* L3 entries are zeroed; the block is no longer live at + * 2 MiB scope and the narrow rollback is sufficient. 
+ */ + inflight_fresh_block_va = UINT64_MAX; + } + } + + uint8_t *map_host = + host_ptr_for_gpa(g, backing_gpa_start + (addr - va_start)); + if (!map_host) + goto fail; + + if (!is_anon && prot != LINUX_PROT_NONE) { + memset(map_host, 0, length); + uint8_t *dst = map_host; + size_t remaining = length; + off_t file_off = (off_t) offset; + while (remaining > 0) { + ssize_t nr = pread(host_backing_fd, dst, remaining, file_off); + if (nr < 0) { + if (errno == EINTR) + continue; + break; + } + if (nr == 0) + break; + dst += nr; + remaining -= (size_t) nr; + file_off += nr; + } + } + + /* Install L3 PTEs for the actual mapped range. Fresh blocks were + * fully invalidated in the loop above so their gap pages do not + * inherit block-level perms; pre-existing blocks are left untouched + * so prior high-VA mmaps into the same 2 MiB block survive. + * + * PROT_NONE still needs an explicit invalidate for the requested + * pages: when the range lands inside a reused 2 MiB block, leaving + * the inherited L3 descriptors intact would make the new guard range + * spuriously accessible. 
+ */ + if (prot == LINUX_PROT_NONE) { + if (guest_invalidate_ptes(g, addr, addr + length) < 0) + goto fail; + } else { + uint64_t gpa_for_addr = backing_gpa_start + (addr - va_start); + if (guest_install_va_pages(g, addr, length, gpa_for_addr, + prot_to_perms(prot)) < 0) + goto fail; + } + + uint64_t gpa_base = backing_gpa_start + (addr - va_start); + if (!region_has_capacity_after_removes(g, NULL, 0, 1)) + goto fail; + + uint64_t backing_gpa_end = backing_gpa_start + backing_span; + if (backing_gpa_end > g->mmap_next) + g->mmap_next = backing_gpa_end; + if (backing_gpa_end > g->mmap_end) + g->mmap_end = backing_gpa_end; + if (guest_region_add_ex_owned_gpa(g, addr, addr + length, gpa_base, prot, + flags, offset, NULL, + track_backing_fd) < 0) + goto fail; + track_backing_fd = -1; + if (close_host_backing_fd && host_backing_fd >= 0) + close(host_backing_fd); + host_fd_ref_close(&backing_ref); + + return (int64_t) addr; + +fail: + /* Roll back PT state installed by this call. The success path + * preserves pre-existing 2 MiB blocks (so prior high-VA mmaps in the + * same block survive); the rollback must respect that same + * invariant. Two cases: + * + * 1. An in-flight fresh block: block-mapped at full-2 MiB perms but + * not yet invalidated. Zero the entire 2 MiB so no stray RW/RX + * mapping survives across the failure. + * 2. The requested subrange [addr, addr+length): pre-existing + * blocks and completed fresh blocks were only ever written + * inside this range by guest_install_va_pages, so a narrow + * invalidate is the right scope. Completed fresh blocks had all + * L3 entries cleared in the loop, so any leftover split- + * inherited descriptors outside [addr, addr+length) are dormant + * and harmless until overwritten by a future mmap into the same + * VA range. Region tracking itself was never updated on this + * path (guest_region_add_ex_owned_gpa is the final commit), and the + * capacity check runs before mmap_next/mmap_end are advanced. 
+ */ + if (inflight_fresh_block_va != UINT64_MAX) { + if (guest_invalidate_ptes(g, inflight_fresh_block_va, + inflight_fresh_block_va + BLOCK_2MIB) < 0) { + log_error( + "sys_mmap_fixed_high_va: rollback invalidate failed for " + "fresh block [0x%llx, 0x%llx)", + (unsigned long long) inflight_fresh_block_va, + (unsigned long long) (inflight_fresh_block_va + BLOCK_2MIB)); + } + } + if (va_installed_end > va_start) { + if (guest_invalidate_ptes(g, addr, addr + length) < 0) { + log_error( + "sys_mmap_fixed_high_va: rollback invalidate failed for " + "VA [0x%llx, 0x%llx)", + (unsigned long long) addr, + (unsigned long long) (addr + length)); + } + } + if (track_backing_fd >= 0) + close(track_backing_fd); + if (close_host_backing_fd && host_backing_fd >= 0) + close(host_backing_fd); + host_fd_ref_close(&backing_ref); + return ret; +} + static int mremap_extend_range(guest_t *g, uint64_t off, uint64_t size, @@ -425,6 +740,7 @@ static int restore_file_overlay_range(guest_t *g, typedef struct { uint64_t start; uint64_t end; + uint64_t gpa_base; int prot; int flags; uint64_t offset; @@ -500,6 +816,7 @@ static int capture_region_snapshots(guest_t *g, region_snapshot_t *snap = &snaps[n++]; snap->start = r->start; snap->end = r->end; + snap->gpa_base = r->gpa_base; snap->prot = r->prot; snap->flags = r->flags; snap->offset = r->offset; @@ -617,10 +934,10 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n) { for (int i = 0; i < n; i++) { region_snapshot_t *snap = &snaps[i]; - if (guest_region_add_ex_owned(g, snap->start, snap->end, snap->prot, - snap->flags, snap->offset, - snap->name[0] ? snap->name : NULL, - snap->backing_fd) < 0) { + if (guest_region_add_ex_owned_gpa( + g, snap->start, snap->end, snap->gpa_base, snap->prot, + snap->flags, snap->offset, snap->name[0] ? 
snap->name : NULL, + snap->backing_fd) < 0) { snap->backing_fd = -1; close_region_snapshots(snaps, n); return -LINUX_ENOMEM; @@ -1178,8 +1495,15 @@ int64_t sys_mmap(guest_t *g, if (!is_anon && (offset & 4095)) return -LINUX_EINVAL; - if (!is_anon && fuse_fd_refuse_mmap(fd)) - return -LINUX_ENODEV; + if (!is_anon && fuse_fd_refuse_mmap(fd)) { + bool allow_materialized_fuse_mmap = + g->is_rosetta && + ((flags & LINUX_MAP_FIXED) || + (flags & LINUX_MAP_FIXED_NOREPLACE)) && + addr >= g->guest_size && !(flags & LINUX_MAP_SHARED); + if (!allow_materialized_fuse_mmap) + return -LINUX_ENODEV; + } /* Round length up to page size (overflow-safe) */ if (length > UINT64_MAX - 4095) @@ -1207,9 +1531,27 @@ int64_t sys_mmap(guest_t *g, if (addr > 0x0000FFFFFFFFFFFFULL) return -LINUX_ENOMEM; + if (addr >= g->guest_size) + return sys_mmap_fixed_high_va(g, addr, length, prot, flags, fd, + offset, is_noreplace); + + /* High-VA MAP_FIXED (rosetta's JIT slabs at 240 TiB, code caches + * at 85 TiB, etc.) is not safe to expose yet. The previous draft + * could install TTBR0 aliases for addresses above the primary guest + * buffer, but munmap/mprotect/fork bookkeeping still track regions + * by low GPA and mmap_next only. Returning success here therefore + * created mappings that later teardown paths could not manage and + * let overflow-backed aliases corrupt the fork snapshot high-water + * mark. Fail closed until the region metadata and teardown paths are + * made VA-aware end-to-end. + */ /* MAP_FIXED: addr is IPA-based, convert to offset */ uint64_t off = addr - g->ipa_base; - /* Use subtraction-based check to avoid off+length overflow */ + /* Use subtraction-based check to avoid off+length overflow. + * Stays primary-buffer-only for the low-VA path because the body + * below issues raw host_base+off arithmetic (memset, pread, etc.). + * The high-VA path above is the alternate route for rosetta. 
+ */ if (off > g->guest_size || length > g->guest_size - off) return -LINUX_ENOMEM; @@ -1792,7 +2134,10 @@ int64_t sys_mremap(guest_t *g, ~(LINUX_MREMAP_MAYMOVE | LINUX_MREMAP_FIXED | LINUX_MREMAP_DONTUNMAP)) return -LINUX_EINVAL; - /* Overflow check on old range */ + /* Overflow check on old range. mremap's body shrinks, copies, and zeroes + * via raw host_base+off arithmetic, so the check stays primary-only + * here until the data-movement paths are made region-aware. + */ uint64_t old_off = old_addr - g->ipa_base; if (old_off > g->guest_size) return -LINUX_EFAULT; @@ -1835,6 +2180,9 @@ int64_t sys_mremap(guest_t *g, if (new_addr & 4095) return -LINUX_EINVAL; uint64_t new_off = new_addr - g->ipa_base; + /* MREMAP_FIXED dest stays primary-only for the same reason as the + * source check above. + */ if (new_off > g->guest_size || new_size > g->guest_size - new_off) return -LINUX_ENOMEM; @@ -2300,9 +2648,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) return 0; /* Range must lie within the guest IPA window. Linux returns -ENOMEM - * (not -EINVAL) for addresses outside the process address space — see + * (not -EINVAL) for addresses outside the process address space; see * madvise(2): "Addresses in the specified range are not currently - * mapped, or are outside the address space of the process." + * mapped, or are outside the address space of the process." Stays + * primary-only because MADV_DONTNEED zeroes via raw host_base+off and + * the slab restore path likewise assumes primary backing. The + * region-aware variant will consume guest_is_valid_range once the body + * is updated. 
*/ uint64_t off = addr - g->ipa_base; if (off > g->guest_size || length > g->guest_size - off) @@ -2521,6 +2873,14 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length) return -LINUX_EINVAL; if (addr <= 0x0000FFFFFFFFFFFFULL) { + if (addr >= g->guest_size) { + if (region_range_overlaps(g, addr, addr + length)) { + if (guest_invalidate_ptes(g, addr, addr + length) < 0) + return -LINUX_ENOMEM; + guest_region_remove(g, addr, addr + length); + } + return 0; + } uint64_t unmap_off = addr - g->ipa_base; if (unmap_off <= g->guest_size && length <= g->guest_size - unmap_off) { uint64_t end = unmap_off + length; @@ -2583,6 +2943,23 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) return -LINUX_EINVAL; if (addr <= 0x0000FFFFFFFFFFFFULL) { + if (addr >= g->guest_size) { + uint64_t mprot_end = addr + length; + if (guest_kbuf_user_va_overlap(addr, length) && + (prot & LINUX_PROT_EXEC)) + return -LINUX_EINVAL; + + guest_region_set_prot(g, addr, mprot_end, prot); + if (prot != LINUX_PROT_NONE) { + if (guest_update_perms(g, addr, mprot_end, + prot_to_perms(prot)) < 0) + return -LINUX_ENOMEM; + } else { + if (guest_invalidate_ptes(g, addr, mprot_end) < 0) + return -LINUX_ENOMEM; + } + return 0; + } uint64_t mprot_off = addr - g->ipa_base; if (mprot_off <= g->guest_size && length <= g->guest_size - mprot_off) { uint64_t mprot_end = mprot_off + length; @@ -2761,6 +3138,11 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags) return -LINUX_ENOMEM; uint64_t off = addr - g->ipa_base; + /* sys_msync stays primary-only here for the same reason as madvise: + * msync iterates regions and reaches into host_base+off to read pages + * back from the file overlay. Widening to extra-region ranges needs a + * region-aware iterator landing alongside the data-movement refactor. 
+ */ if (off > g->guest_size || length > g->guest_size - off) return -LINUX_ENOMEM; uint64_t end = off + length; diff --git a/src/syscall/net-sockopt.c b/src/syscall/net-sockopt.c index ec21b3b..d52fc3d 100644 --- a/src/syscall/net-sockopt.c +++ b/src/syscall/net-sockopt.c @@ -124,6 +124,8 @@ static int net_sock_opt_index_for(int level, int optname) } if (level == LINUX_IPPROTO_IPV6 && optname == LINUX_IPV6_V6ONLY) return SOCK_OPT_IPV6_V6ONLY; + if (level == LINUX_IPPROTO_IP && optname == LINUX_IP_MTU_DISCOVER) + return SOCK_OPT_IP_MTU_DISCOVER; return -1; } diff --git a/src/syscall/net.c b/src/syscall/net.c index fb004cc..b80ca18 100644 --- a/src/syscall/net.c +++ b/src/syscall/net.c @@ -21,6 +21,7 @@ #include "utils.h" +#include "core/rosetta.h" #include "debug/log.h" #include #include @@ -42,10 +43,46 @@ /* Syscall implementations. */ -int64_t sys_socket(guest_t *g, int domain, int type, int protocol) +static bool rosetta_socket_shim_enabled(guest_t *g) +{ + if (!g || !g->is_rosetta) + return false; + + size_t cmdline_len = 0; + const char *cmdline = proc_get_cmdline(&cmdline_len); + size_t rosetta_len = strlen(ROSETTA_PATH); + + return cmdline && cmdline_len > rosetta_len && + memcmp(cmdline, ROSETTA_PATH, rosetta_len + 1) == 0; +} + +static bool rosettad_connect_target(const struct sockaddr_storage *mac_sa) +{ + if (!mac_sa || mac_sa->ss_family != AF_UNIX) + return false; + const struct sockaddr_un *sun = (const struct sockaddr_un *) mac_sa; + return strcmp(sun->sun_path, ROSETTAD_SOCKET_PATH) == 0; +} + +static bool rosetta_seqpacket_placeholder(guest_t *g, int guest_fd, int host_fd) { - (void) g; + int cached_type = 0; + if (!rosetta_socket_shim_enabled(g) || + !net_socket_cached_int_get(guest_fd, LINUX_SOL_SOCKET, LINUX_SO_TYPE, + &cached_type) || + cached_type != LINUX_SOCK_SEQPACKET || rosettad_is_socket(host_fd)) + return false; + + int so_type = 0; + socklen_t so_type_len = sizeof(so_type); + if (getsockopt(host_fd, SOL_SOCKET, SO_TYPE, &so_type, 
&so_type_len) < 0) + return false; + return (so_type & 0xF) == SOCK_STREAM; +} + +int64_t sys_socket(guest_t *g, int domain, int type, int protocol) +{ /* AF_NETLINK: synthetic emulation, no macOS equivalent */ if (domain == LINUX_AF_NETLINK) return netlink_socket(protocol, type); @@ -55,6 +92,37 @@ int64_t sys_socket(guest_t *g, int domain, int type, int protocol) int nonblock = extract_sock_nonblock(type); int cloexec = extract_sock_cloexec(type); + /* Rosetta opens AF_UNIX SOCK_SEQPACKET to talk to rosettad. macOS does + * not support SOCK_SEQPACKET on AF_UNIX, so while the translator process + * is active we create an unconnected SOCK_STREAM placeholder instead. + * sys_connect() upgrades only the specific rosettad path to the private + * socketpair/handler transport; any other connect on this placeholder + * fails so unrelated Unix IPC is not silently downgraded to STREAM. + */ + if (rosetta_socket_shim_enabled(g) && mac_domain == AF_UNIX && + real_type == LINUX_SOCK_SEQPACKET) { + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return linux_errno(); + int one = 1; + setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)); + if ((nonblock && fd_set_nonblock(fd) < 0) || + (cloexec && fd_set_cloexec(fd) < 0)) { + close(fd); + return linux_errno(); + } + + int gfd = fd_alloc(FD_SOCKET, fd, NULL); + if (gfd < 0) { + close(fd); + return -LINUX_EMFILE; + } + if (cloexec) + fd_table[gfd].linux_flags |= LINUX_O_CLOEXEC; + net_socket_cache_init_defaults(gfd, domain, real_type); + return gfd; + } + int fd = socket(mac_domain, real_type, protocol); if (fd < 0) return linux_errno(); @@ -368,6 +436,65 @@ int64_t sys_connect(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen) return 0; } + /* Upgrade the translator's fake AF_UNIX/SOCK_SEQPACKET placeholder to the + * private rosettad bridge only when it actually connects to the rosettad + * Unix path from the VZ_CAPS payload. 
+ */ + int cached_type = 0; + bool shimmed_seqpacket = rosetta_seqpacket_placeholder(g, fd, host_ref.fd); + if (shimmed_seqpacket && + net_socket_cached_int_get(fd, LINUX_SOL_SOCKET, LINUX_SO_TYPE, + &cached_type) && + rosettad_connect_target(&mac_sa)) { + int pair[2]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) < 0) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + + int one = 1; + setsockopt(pair[0], SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)); + setsockopt(pair[1], SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)); + + int old_status = fcntl(host_ref.fd, F_GETFL, 0); + fd_entry_t snap; + bool have_snap = fd_snapshot(fd, &snap); + if ((old_status >= 0 && (old_status & O_NONBLOCK) && + fd_set_nonblock(pair[0]) < 0) || + (have_snap && (snap.linux_flags & LINUX_O_CLOEXEC) && + fd_set_cloexec(pair[0]) < 0)) { + close(pair[0]); + close(pair[1]); + host_fd_ref_close(&host_ref); + return linux_errno(); + } + + if (fd_alloc_at(fd, FD_SOCKET, pair[0]) < 0) { + close(pair[0]); + close(pair[1]); + host_fd_ref_close(&host_ref); + return -LINUX_EMFILE; + } + if (have_snap) + fd_table[fd].linux_flags = snap.linux_flags; + net_socket_cache_init_defaults(fd, LINUX_AF_UNIX, cached_type); + + if (rosettad_start_handler(pair[1], pair[0]) < 0) { + close(pair[1]); + log_warn( + "sys_connect: rosettad handler thread failed to start; " + "rosetta will see EOF on its socketpair"); + } + + host_fd_ref_close(&host_ref); + return 0; + } + + if (shimmed_seqpacket) { + host_fd_ref_close(&host_ref); + return -LINUX_EPROTOTYPE; + } + if (connect(host_ref.fd, (struct sockaddr *) &mac_sa, (socklen_t) mac_len) < 0) { host_fd_ref_close(&host_ref); @@ -676,8 +803,42 @@ int64_t sys_setsockopt(guest_t *g, return 0; } - if (level == LINUX_IPPROTO_IP && - (optname == LINUX_IP_MTU_DISCOVER || optname == LINUX_IP_RECVERR)) { + if (level == LINUX_IPPROTO_IP && optname == LINUX_IP_MTU_DISCOVER) { + /* P2P networking tools (libp2p, syncthing, WireGuard userland, + * Tailscale's bundled 
tailscaled) set IP_MTU_DISCOVER early in + * connect and abort on -ENOPROTOOPT. macOS has no direct + * equivalent; accept the option, cache the Linux PMTUD mode for + * getsockopt round-trip, and where the host can honour it, push + * the closest IP_DONTFRAG setting onto the underlying socket. + * Linux PMTUD modes: + * 0 DONT / 1 WANT -> allow fragmentation (DONTFRAG off) + * 2 DO / 3 PROBE / 4 INTERFACE -> set DF (DONTFRAG on) + * 5 OMIT -> behave like DONT (best-effort) + */ + if (!net_socket_fd_is_valid(fd)) + return -LINUX_EBADF; + if (optlen > sizeof(int)) + return -LINUX_EINVAL; + int value = 0; + if (optlen > 0 && guest_read_small(g, optval_gva, &value, optlen) < 0) + return -LINUX_EFAULT; + net_socket_cached_int_set(fd, LINUX_IPPROTO_IP, LINUX_IP_MTU_DISCOVER, + value); + host_fd_ref_t hr; + if (host_fd_ref_open(fd, &hr) == 0) { + int dontfrag = (value >= 2 && value <= 4) ? 1 : 0; + (void) setsockopt(hr.fd, IPPROTO_IP, IP_DONTFRAG, &dontfrag, + sizeof(dontfrag)); + host_fd_ref_close(&hr); + } + return 0; + } + if (level == LINUX_IPPROTO_IP && optname == LINUX_IP_RECVERR) { + /* No macOS equivalent for the Linux extended-error queue. Accept + * and discard; the queue stays empty, so subsequent recvmsg with + * MSG_ERRQUEUE returns -EAGAIN as Linux would for a quiescent + * connection. + */ if (!net_socket_fd_is_valid(fd)) return -LINUX_EBADF; if (optlen > sizeof(int)) @@ -854,7 +1015,14 @@ int64_t sys_getsockopt(guest_t *g, if (!net_socket_fd_is_valid(fd)) return -LINUX_EBADF; if (guest_optlen >= sizeof(int)) { + /* IP_MTU_DISCOVER round-trips through the per-fd cache so + * getsockopt reports what the guest last wrote via setsockopt. + * IP_RECVERR has no cache (the extended-error queue stays + * permanently empty), so it always reports 1. 
+ */ int val = 1; + if (optname == LINUX_IP_MTU_DISCOVER) + (void) net_socket_cached_int_get(fd, level, optname, &val); uint32_t out_len = sizeof(int); if (guest_write_small(g, optval_gva, &val, sizeof(val)) < 0) return -LINUX_EFAULT; diff --git a/src/syscall/net.h b/src/syscall/net.h index 01ec76c..29ecbb4 100644 --- a/src/syscall/net.h +++ b/src/syscall/net.h @@ -26,6 +26,9 @@ #define NETLINK_ROUTE 0 /* Linux socket types + flags. */ +#define LINUX_SOCK_STREAM 1 +#define LINUX_SOCK_DGRAM 2 +#define LINUX_SOCK_SEQPACKET 5 #define LINUX_SOCK_NONBLOCK 0x800 #define LINUX_SOCK_CLOEXEC 0x80000 diff --git a/src/syscall/proc.c b/src/syscall/proc.c index 71d2ba3..51ea50f 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -54,6 +54,19 @@ static _Atomic uint64_t wxcount_to_rx = 0; /* RW->RX (exec fault) */ static _Atomic uint64_t wxcount_to_rw = 0; /* RX->RW (write fault) */ static _Atomic uint64_t sysreg_write_count = 0; /* EC=0x18 Dir=0 (DC CVAU, IC IVAU, etc.) */ +/* x86_64-via-Rosetta is on by default: the architecture is auto-detected + * from the ELF header (EM_X86_64), and rosetta is the only viable path for + * those binaries on Apple Silicon. The --no-rosetta CLI flag (or + * ELFUSE_NO_ROSETTA=1) disables it; without rosetta installed, the rosetta + * loader fails its access() check and surfaces an install hint regardless. + */ +static _Atomic bool rosetta_enabled = true; +/* Runtime indicator: distinct from rosetta_enabled (user opt-in). Set when + * the active guest_t is actually running under rosetta, so callers without + * direct guest_t access (proc_intercept_readlink, log paths) can branch on + * runtime state without threading g through every signature. 
+ */ +static _Atomic bool rosetta_active = false; /* Process table for tracking fork children */ static proc_entry_t proc_table[PROC_TABLE_SIZE]; @@ -83,6 +96,26 @@ void proc_init(void) futex_init(); } +void proc_set_rosetta_enabled(bool enabled) +{ + atomic_store(&rosetta_enabled, enabled); +} + +bool proc_rosetta_enabled(void) +{ + return atomic_load(&rosetta_enabled); +} + +void proc_set_rosetta_active(bool active) +{ + atomic_store(&rosetta_active, active); +} + +bool proc_rosetta_active(void) +{ + return atomic_load(&rosetta_active); +} + void proc_request_exit_group(int code) { atomic_store(&exit_group_code, code); diff --git a/src/syscall/proc.h b/src/syscall/proc.h index 3b5cc94..6b1e161 100644 --- a/src/syscall/proc.h +++ b/src/syscall/proc.h @@ -82,6 +82,18 @@ void proc_set_elfuse_path(const char *path); /* Get the stored elfuse binary path. Returns NULL if not set. */ const char *proc_get_elfuse_path(void); +/* Process-wide feature gate for x86_64-via-Rosetta support. */ +void proc_set_rosetta_enabled(bool enabled); +bool proc_rosetta_enabled(void); + +/* Runtime indicator: true once the guest_t has been initialised in rosetta + * mode. Distinct from proc_rosetta_enabled which reflects the user opt-in. + * Code paths that lack direct guest_t access (proc_intercept_readlink) can + * branch on the runtime state without threading g through every signature. + */ +void proc_set_rosetta_active(bool active); +bool proc_rosetta_active(void); + /* Store the guest command line for /proc/self/cmdline emulation. * argv is a NULL-terminated array of strings. 
*/ diff --git a/src/syscall/time.c b/src/syscall/time.c index a756d27..8a76c4b 100644 --- a/src/syscall/time.c +++ b/src/syscall/time.c @@ -15,9 +15,10 @@ #include "utils.h" +#include "runtime/thread.h" /* current_thread, guest_tid */ #include "syscall/abi.h" #include "syscall/internal.h" -#include "syscall/proc.h" /* proc_exit_group_requested */ +#include "syscall/proc.h" /* proc_exit_group_requested, proc_get_pid */ #include "syscall/signal.h" #include "syscall/time.h" @@ -178,14 +179,32 @@ static int translate_clockid(int linux_clockid) return CLOCK_MONOTONIC; default: /* Handle Linux dynamic CPU clock IDs (negative values). - * Decode: pid = ~(clockid >> 3), perthread = clockid & 4 - * Only support pid=0 (self); other pids have no macOS equivalent. + * Decode: encoded id = ~(clockid >> 3), perthread = clockid & 4, + * type bits = clockid & 3 (PROF=0, VIRT=1, SCHED=2). + * Linux's convention: + * encoded id == 0 -> "self" (process or thread variant) + * encoded id == pid -> that process + * per-thread variant: encoded id == 0 means current thread, + * or the target TID for pthread_getcpuclockid. + * + * The macOS host only exposes CLOCK_THREAD_CPUTIME_ID for the + * calling thread, so cross-thread or cross-process queries are + * unsupportable. Accept both process and per-thread clocks when + * they refer to self (encoded 0 or matching self pid/tid). Reject + * foreign ids and reserved type bits with -EINVAL. */ if (linux_clockid < 0) { - int pid = LINUX_CPUCLOCK_PID(linux_clockid); - if (pid != 0) - return -1; /* Other process CPU times are unavailable */ + int type_bits = linux_clockid & 3; + if (type_bits == 3) + return -1; /* Reserved type bits in dynamic clock id */ + int encoded_id = LINUX_CPUCLOCK_PID(linux_clockid); bool is_perthread = linux_clockid & LINUX_CPUCLOCK_PERTHREAD_MASK; + int self_pid = (int) proc_get_pid(); + int self_tid = + current_thread ? (int) current_thread->guest_tid : self_pid; + int self_match = is_perthread ? 
self_tid : self_pid; + if (encoded_id != 0 && encoded_id != self_match) + return -1; /* Foreign process/thread clocks unsupported */ return is_perthread ? CLOCK_THREAD_CPUTIME_ID : CLOCK_PROCESS_CPUTIME_ID; } diff --git a/tests/bench-rosetta.sh b/tests/bench-rosetta.sh new file mode 100755 index 0000000..48e031a --- /dev/null +++ b/tests/bench-rosetta.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# bench-rosetta.sh - Wall-clock benchmark harness for x86_64-via-Rosetta +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Measures wall-clock time for a curated set of static x86_64 workloads +# running under elfuse + Rosetta. The intent is a stable reproducible +# regression baseline; absolute numbers are noisy on macOS so the script +# prints best-of-N runs and a coefficient of variation. +# +# The bench deliberately stays self-contained: +# - workloads come from the Alpine x86_64 staticbin tree +# - no external comparison runs (those need separate hardware access) +# +# To compare against native x86_64 hardware or aarch64 hosts, capture the +# same workloads' wall-clock there and paste them into the output yourself. +# +# Usage: tests/bench-rosetta.sh [path/to/elfuse] [iterations] + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +ITERS="${2:-5}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +FIXTURES="${FIXTURES_DIR:-externals/test-fixtures}" +STATICBIN_LONG="${FIXTURES}/x86_64-musl/staticbin/bin" +ROSETTA_PATH=/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta +SHORTDIR=/tmp/elfuse-br + +if [ ! -x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2 + exit 77 +fi +if [ ! -x "${STATICBIN_LONG}/busybox" ]; then + printf 'x86_64 fixture tree missing at %s\n' "$STATICBIN_LONG" >&2 + printf 'stage via: INCLUDE_X86_64=1 bash tests/fetch-fixtures.sh\n' >&2 + exit 77 +fi +if [ ! 
-x "$ELFUSE" ]; then + printf 'elfuse not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi + +# Stage symlinks so paths stay inside rosetta's 42-byte caps cap. +rm -rf "$SHORTDIR" +mkdir -p "${SHORTDIR}/bin" "${SHORTDIR}/data" +staticbin_abs="$(cd "$STATICBIN_LONG" && pwd)" +ln -s "${staticbin_abs}/busybox" "${SHORTDIR}/bin/busybox" +for applet in echo cat seq factor sha256sum md5sum sha512sum sort wc \ + expr base64 cksum; do + ln -s busybox "${SHORTDIR}/bin/${applet}" +done + +trap 'rm -rf "$SHORTDIR"' EXIT + +# Pre-generate a small input file for hash/sort workloads. 64 KiB. +data="${SHORTDIR}/data/in.bin" +dd if=/dev/urandom of="$data" bs=1024 count=64 status=none + +# Capture wall-clock in nanoseconds across N iterations of CMD. Returns +# the best (minimum) sample and a basic spread (max - min). +# Args: