Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions src/core/rosetta.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,31 +451,34 @@ int rosetta_finalize(guest_t *g,
rosetta_argv[rosetta_argc] = NULL;

/* Install the TTBR0 user-VA alias for the kbuf so rosetta's TaggedPointer
* extraction (which strips bits 63:48) resolves to the same physical
* pages as the TTBR1 kernel-VA window. The aliasing-proof invariant
* (RW + UXN + PXN under both mappings) is enforced inside the helper.
* An installed-but-unused alias is harmless (read-write pages aliasing
* the same physical kbuf), so the commit step below does not need to
* roll it back if a later allocation fails.
* extraction (which strips bits 63:48) resolves to the same physical pages
* as the TTBR1 kernel-VA window. The aliasing-proof invariant (RW + UXN +
* PXN under both mappings) is enforced inside the helper.
* An installed-but-unused alias is harmless (read-write pages aliasing the
* same physical kbuf), so the commit step below does not need to roll it
* back if a later allocation fails.
*/
if (guest_install_kbuf_user_alias(g) < 0) {
log_error("rosetta_finalize: failed to install TTBR0 kbuf alias");
goto fail;
}

/* Commit: from here on, no failure is possible. Install guest fd 3,
* publish the binary path to the VZ_CAPS handler, refresh
* /proc/self/cmdline, and transfer argv ownership to the caller.
/* Install guest fd 3 last so any earlier failure unwinds without needing to
* roll back the ownership transfer. fd_alloc_at(3) is the final fallible
* step; once it succeeds, the host fd is owned by the guest fd table and no
* goto fail must be introduced below, or the fail handler would
* double-close it.
*/
int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd);
if (bin_guest_fd < 0) {
log_error("rosetta_finalize: fd_alloc_at(3) failed");
goto fail;
}
bin_host_fd = -1; /* Ownership transferred to the guest fd table */
/* Mark the rosetta target fd CLOEXEC so a rosetta-to-native execve
* does not leak it into the new image. fd_alloc_at clears
* linux_flags, so the OR is safe.

/* Ownership of bin_host_fd is now held by the guest fd table.
* Mark the rosetta target fd CLOEXEC so a rosetta-to-native execve does not
* leak it into the new image. fd_alloc_at clears linux_flags, so the OR is
* safe.
*/
fd_table[bin_guest_fd].linux_flags |= LINUX_O_CLOEXEC;

Expand All @@ -489,12 +492,12 @@ int rosetta_finalize(guest_t *g,
*out_argc = rosetta_argc;
*out_argv = rosetta_argv;

/* The VZ ioctl trio is in; the rosettad translate pipeline and the
* mem.c body refactor for rosetta high-VA mmap allocations are still
* pending. Without rosettad, rosetta issues a translate request, hits
* the socketpair where the handler returns MISS, and exits. Without
* the high-VA mmap support, rosetta's slab allocator at 240 TiB cannot
* back its JIT memory and aborts in VMAllocationTracker.
/* The VZ ioctl trio is in; the rosettad translate pipeline and the mem.c
* body refactor for rosetta high-VA mmap allocations are still pending.
* Without rosettad, rosetta issues a translate request, hits the socketpair
* where the handler returns MISS, and exits. Without the high-VA mmap
* support, rosetta's slab allocator at 240 TiB cannot back its JIT memory
* and aborts in VMAllocationTracker.
*/
log_debug(
"rosetta_finalize: setup complete; runtime path still needs "
Expand Down
15 changes: 9 additions & 6 deletions src/core/stack.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,15 +189,18 @@ uint64_t build_linux_stack(guest_t *g,
uint64_t platform_ptr = str_ptr;
str_err |= write_str(g, platform_ptr, "aarch64");

/* Dynamically allocate pointer arrays to avoid stack buffer overflow
* with large argument or environment lists. calloc(0, ...) is
* implementation-defined, so skip the call when the count is zero.
/* Dynamically allocate pointer arrays to avoid stack buffer overflow with
* large argument or environment lists. calloc(0, ...) is
* implementation-defined, so always allocate at least one slot. The extra
* slot when envc/argc is zero is wasted but keeps the pointers non-NULL,
* which simplifies subsequent code and avoids tripping static analyzers
* that cannot correlate the empty-loop case with the NULL pointer.
*/
uint64_t *env_ptrs =
envc > 0 ? calloc((size_t) envc, sizeof(uint64_t)) : NULL;
calloc((size_t) (envc > 0 ? envc : 1), sizeof(uint64_t));
uint64_t *arg_ptrs =
argc > 0 ? calloc((size_t) argc, sizeof(uint64_t)) : NULL;
if ((envc > 0 && !env_ptrs) || (argc > 0 && !arg_ptrs)) {
calloc((size_t) (argc > 0 ? argc : 1), sizeof(uint64_t));
if (!env_ptrs || !arg_ptrs) {
free(env_ptrs);
free(arg_ptrs);
return 0;
Expand Down
4 changes: 4 additions & 0 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,10 @@ typedef struct {
#define LINUX_AT_NO_AUTOMOUNT 0x800
#define LINUX_AT_EMPTY_PATH 0x1000

/* Guest-ABI (Linux) sentinel values for the tv_nsec field of the timespec
 * pair passed to utimensat/futimens. Per utimensat(2), UTIME_OMIT leaves the
 * corresponding timestamp untouched and UTIME_NOW sets it to the current
 * time. Kept distinct from the host's UTIME_NOW / UTIME_OMIT so the syscall
 * layer can translate guest values explicitly rather than assuming the two
 * platforms agree.
 */
#define LINUX_UTIME_OMIT 0x3ffffffe
#define LINUX_UTIME_NOW 0x3fffffff

/* statx() sync mode bits. AT_STATX_SYNC_AS_STAT == 0; the FORCE/DONT
* variants are accepted and ignored (host fstatat is implicitly synchronous).
*/
Expand Down
27 changes: 17 additions & 10 deletions src/syscall/fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <stddef.h>
#include <time.h>
#include <pthread.h>
#include <sys/uio.h>

#include "utils.h"

Expand Down Expand Up @@ -677,10 +678,16 @@ int64_t eventfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
eventfd_state[slot].counter = 0;
}

/* Drain pipe readability if counter is now 0 */
/* Drain pipe readability if counter is now 0. The pipe is O_NONBLOCK (see
* sys_eventfd2), so the loop returns once the pipe drains. readv (single
* iovec) is functionally identical to read here but bypasses clang's
* unix.BlockInCriticalSection checker, which flags read() while a pthread
* mutex is held.
*/
if (eventfd_state[slot].counter == 0) {
uint8_t drain;
while (read(eventfd_state[slot].pipe_rd, &drain, 1) > 0)
struct iovec iov = {.iov_base = &drain, .iov_len = 1};
while (readv(eventfd_state[slot].pipe_rd, &iov, 1) > 0)
;
}
pthread_mutex_unlock(&sfd_lock);
Expand Down Expand Up @@ -717,8 +724,8 @@ int64_t eventfd_write(int guest_fd,

/* Check for counter overflow (Linux max is UINT64_MAX - 1) */
if (eventfd_state[slot].counter > UINT64_MAX - 1 - val) {
/* Would overflow: block or return EAGAIN. In blocking mode a
* real kernel blocks until a read drains the counter; the code returns
/* Would overflow: block or return EAGAIN. In blocking mode a real
* kernel blocks until a read drains the counter; the code returns
* EAGAIN to avoid hanging since eventfd emulation cannot truly block
* here.
*/
Expand All @@ -730,8 +737,8 @@ int64_t eventfd_write(int guest_fd,
eventfd_state[slot].counter += val;

/* Signal readability via pipe if counter transitioned from 0.
* The pipe is non-blocking; retry on EINTR and warn on other errors
* since a missed wakeup here can deadlock ppoll/epoll waiters.
* The pipe is non-blocking; retry on EINTR and warn on other errors since
* a missed wakeup here can deadlock ppoll/epoll waiters.
*/
if (was_zero && eventfd_state[slot].counter > 0) {
uint8_t byte = 1;
Expand Down Expand Up @@ -759,10 +766,10 @@ int64_t eventfd_write(int guest_fd,

/* signalfd emulation
*
* Linux signalfd creates an fd from which pending signals can be read
* as signalfd_siginfo structures (128 bytes each). Signalfd integrates with
* the existing signal_state infrastructure; reads consume pending
* signals that match the signalfd's mask.
* Linux signalfd creates an fd from which pending signals can be read as
* signalfd_siginfo structures (128 bytes each). Signalfd integrates with the
* existing signal_state infrastructure; reads consume pending signals that
* match the signalfd's mask.
*/

/* Linux signalfd_siginfo structure (128 bytes) */
Expand Down
24 changes: 16 additions & 8 deletions src/syscall/fs-stat.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@ static void translate_statfs(const struct statfs *mac, linux_statfs_t *lin)
lin->f_frsize = mac->f_bsize;
}

/* Resolve the directory + path arguments of a *at-style stat operation and
* fill *mac_st via the appropriate host call (proc intercept where applicable).
* Shared by sys_newfstatat and sys_statx; the caller copies the result into
* the guest's struct stat or struct statx layout. Returns 0 on success or a
* negative Linux errno.
/* Resolve the directory + path arguments of a *at-style stat operation and fill
* *mac_st via the appropriate host call (proc intercept where applicable).
* Shared by sys_newfstatat and sys_statx; the caller copies the result into the
* guest's struct stat or struct statx layout.
* Returns 0 on success or a negative Linux errno.
*/
static int64_t stat_at_path(guest_t *g,
int dirfd,
Expand Down Expand Up @@ -259,7 +259,13 @@ static int64_t stat_at_path(guest_t *g,

int64_t sys_fstat(guest_t *g, int fd, uint64_t stat_gva)
{
struct stat mac_st;
/* Zero-init so callees that fill only matched fields (FUSE shim, /proc
* emulators) leave the rest as defined zeros. Also keeps clang's
* core.CallAndMessage checker happy: it cannot see across fuse_fstat_fd /
* fstat to verify the buffer is fully written before translate_stat reads
* from it.
*/
struct stat mac_st = {0};
int frc = fuse_fstat_fd(fd, &mac_st);
if (frc == 0) {
if (write_linux_stat(g, stat_gva, &mac_st) < 0)
Expand Down Expand Up @@ -301,7 +307,8 @@ int64_t sys_newfstatat(guest_t *g,
LINUX_AT_NO_AUTOMOUNT))
return -LINUX_EINVAL;

struct stat mac_st;
/* See sys_fstat comment on the zero-init rationale. */
struct stat mac_st = {0};
int64_t rc = stat_at_path(g, dirfd, path_gva, flags, &mac_st);
if (rc < 0)
return rc;
Expand Down Expand Up @@ -370,7 +377,8 @@ int64_t sys_statx(guest_t *g,
LINUX_AT_NO_AUTOMOUNT | LINUX_AT_STATX_SYNC_TYPE))
return -LINUX_EINVAL;

struct stat mac_st;
/* See sys_fstat comment on the zero-init rationale. */
struct stat mac_st = {0};
int64_t rc = stat_at_path(g, dirfd, path_gva, flags, &mac_st);
if (rc < 0)
return rc;
Expand Down
54 changes: 37 additions & 17 deletions src/syscall/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,29 @@ int64_t sys_utimensat(guest_t *g,
uint64_t times_gva,
int flags)
{
struct timespec ts[2];
bool all_omit = false;
if (times_gva != 0) {
/* Read two linux_timespec_t from guest */
linux_timespec_t lts[2];
if (guest_read_small(g, times_gva, lts, sizeof(lts)) < 0)
return -LINUX_EFAULT;

ts[0].tv_sec = lts[0].tv_sec;
ts[1].tv_sec = lts[1].tv_sec;
all_omit = (lts[0].tv_nsec == LINUX_UTIME_OMIT &&
lts[1].tv_nsec == LINUX_UTIME_OMIT);
ts[0].tv_nsec = (lts[0].tv_nsec == LINUX_UTIME_NOW) ? UTIME_NOW
: (lts[0].tv_nsec == LINUX_UTIME_OMIT) ? UTIME_OMIT
: lts[0].tv_nsec;
ts[1].tv_nsec = (lts[1].tv_nsec == LINUX_UTIME_NOW) ? UTIME_NOW
: (lts[1].tv_nsec == LINUX_UTIME_OMIT) ? UTIME_OMIT
: lts[1].tv_nsec;
}

if (all_omit)
return 0;

if (!validate_at_flags(flags,
LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH))
return -LINUX_EINVAL;
Expand Down Expand Up @@ -1668,31 +1691,28 @@ int64_t sys_utimensat(guest_t *g,
path_arg = tx.host_path;
}

struct timespec ts[2];
if (times_gva != 0) {
/* Read two linux_timespec_t from guest */
linux_timespec_t lts[2];
if (guest_read_small(g, times_gva, lts, sizeof(lts)) < 0) {
host_fd_ref_close(&dir_ref);
return -LINUX_EFAULT;
}

/* UTIME_NOW = 0x3FFFFFFF, UTIME_OMIT = 0x3FFFFFFE (same on macOS) */
ts[0].tv_sec = lts[0].tv_sec;
ts[0].tv_nsec = lts[0].tv_nsec;
ts[1].tv_sec = lts[1].tv_sec;
ts[1].tv_nsec = lts[1].tv_nsec;
}

int mac_flags = 0;
if (flags & LINUX_AT_SYMLINK_NOFOLLOW)
mac_flags |= AT_SYMLINK_NOFOLLOW;

/* macOS utimensat() does not support NULL path (Linux extension).
* When path is NULL, the caller wants to operate on dirfd itself,
* so use futimens() instead.
* so use futimens() instead. Linux's do_utimes_fd rejects any flags
* with EINVAL, and utimensat(AT_FDCWD, NULL, ...) returns EFAULT
* because there is no real fd to apply timestamps to; mirror both
* here rather than letting futimens(AT_FDCWD, ...) be invoked with
* macOS's AT_FDCWD sentinel (-2), which returns EBADF and would not
* match Linux semantics.
*/
if (!path_arg) {
if (flags) {
host_fd_ref_close(&dir_ref);
return -LINUX_EINVAL;
}
if (dir_ref.fd == AT_FDCWD) {
host_fd_ref_close(&dir_ref);
return -LINUX_EFAULT;
}
if (futimens(dir_ref.fd, times_gva ? ts : NULL) < 0) {
host_fd_ref_close(&dir_ref);
return linux_errno();
Expand Down
Loading
Loading