Skip to content

Codex sessions causing OOM causes full WSL system crash #18041

@kendonB

Description

@kendonB

What version of Codex CLI is running?

121

What subscription do you have?

Pro

Which model were you using?

gpt-5.4 xhigh

What platform is your computer?

WSL

What terminal emulator and version are you using (if applicable)?

Windows Terminal

What issue are you seeing?

If a process launched by a Codex agent runs out of memory, my WSL instance crashes and cannot be recovered without a restart. When I run the same process myself in an interactive session, the standard process killer works correctly and kills it.

What steps can reproduce the bug?

Add memory_ramp_test.py

#!/usr/bin/env python3

import datetime as dt
import mmap
import os
import signal
import sys
import time


# Size of one memory page as reported by sysconf(SC_PAGE_SIZE);
# touch_mapping() uses it as the stride between the bytes it writes.
PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")


def env_int(name, default):
    """Return environment variable ``name`` parsed as an integer.

    ``default`` is returned as-is when the variable is unset or set to the
    empty string. Any other value is handed to ``int()``, so a non-numeric
    value raises ``ValueError``.
    """
    text = os.environ.get(name)
    # Both None (unset) and "" (set but empty) are falsy and select the default.
    return int(text) if text else default


def read_meminfo():
    """Parse ``/proc/meminfo`` into a dict of field name -> integer value.

    Each line is split at its first colon. The first whitespace-separated
    token after the colon is converted with ``int()``; anything following it
    (such as a unit suffix) is discarded. Lines with nothing after the colon
    are skipped.
    """
    fields = {}
    with open("/proc/meminfo", "r", encoding="utf-8") as meminfo_file:
        rows = (row.split(":", 1) for row in meminfo_file)
        for name, rest in rows:
            tokens = rest.split()
            if not tokens:
                continue
            fields[name] = int(tokens[0])
    return fields


def read_status():
    """Parse ``/proc/self/status`` into a dict of field name -> string value.

    Each line is split at its first colon; the value keeps its original text
    (including any unit suffix) with surrounding whitespace stripped.
    """
    with open("/proc/self/status", "r", encoding="utf-8") as status_file:
        pairs = (row.split(":", 1) for row in status_file)
        return {name: rest.strip() for name, rest in pairs}


def mib_from_kib(kib):
    """Convert a quantity in KiB to MiB using true (float) division."""
    kib_per_mib = 1024
    return kib / kib_per_mib


def timestamp():
    """Return the current local time as an ISO 8601 string.

    The value is timezone-aware (it carries the local UTC offset) and is
    truncated to whole seconds.
    """
    utc_now = dt.datetime.now(dt.timezone.utc)
    local_now = utc_now.astimezone()  # no argument: convert to the system local zone
    return local_now.isoformat(timespec="seconds")


def log(message):
    """Print ``message`` to stdout prefixed with the current timestamp.

    The stream is flushed on every call so the line is emitted immediately
    rather than sitting in the output buffer.
    """
    line = f"{timestamp()} {message}"
    print(line, flush=True)


def touch_mapping(mapping):
    """Write the value 1 into one byte per ``PAGE_SIZE`` stride of ``mapping``.

    Offsets 0, PAGE_SIZE, 2 * PAGE_SIZE, ... below ``len(mapping)`` are
    written, followed by the final byte of the buffer.
    """
    # NOTE(review): presumably the writes exist to force every page of the
    # mapping to be backed by real memory -- confirm against intent.
    size = len(mapping)
    position = 0
    while position < size:
        mapping[position] = 1
        position += PAGE_SIZE
    mapping[size - 1] = 1


def main():
    """Allocate memory in chunks, hold it for a while, and log progress.

    Configuration is read from integer environment variables:

    * ``CHUNK_MIB`` (default 512): size of each anonymous mapping.
    * ``MAX_ALLOC_MIB`` (default 20480): total allocation ceiling.
    * ``MIN_AVAILABLE_MIB`` (default 1024): stop ramping once MemAvailable
      from /proc/meminfo is at or below this value.
    * ``SLEEP_SECONDS`` (default 10): pause after each chunk.
    * ``HOLD_SECONDS`` (default 300): how long to keep the memory once the
      ramp has finished.

    SIGTERM and SIGINT request a graceful stop: the ramp ends after the
    current step, any pause in progress is cut short, and the hold phase is
    skipped or abandoned.

    Returns:
        0 on normal completion (including a signal-requested stop),
        2 if a ``MemoryError`` was raised,
        1 if any other exception was raised.
    """
    chunk_mib = env_int("CHUNK_MIB", 512)
    max_alloc_mib = env_int("MAX_ALLOC_MIB", 20480)
    min_available_mib = env_int("MIN_AVAILABLE_MIB", 1024)
    sleep_seconds = env_int("SLEEP_SECONDS", 10)
    hold_seconds = env_int("HOLD_SECONDS", 300)

    mappings = []
    allocated_mib = 0
    stopping = False

    def handle_signal(signum, _frame):
        nonlocal stopping
        stopping = True
        log(f"received_signal={signum}; stopping after current step")

    def interruptible_sleep(seconds):
        """Sleep up to ``seconds``, returning early once a stop is requested."""
        if seconds < 0:
            # Keep time.sleep()'s contract for invalid durations.
            raise ValueError("sleep length must be non-negative")
        # time.sleep() resumes sleeping after a signal handler returns
        # (PEP 475), so a single long sleep would ignore the stop request.
        # Sleeping in short slices lets the flag be re-checked promptly.
        deadline = time.monotonic() + seconds
        while not stopping:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, 0.5))

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    log(
        "memory_ramp_start "
        f"pid={os.getpid()} chunk_mib={chunk_mib} max_alloc_mib={max_alloc_mib} "
        f"min_available_mib={min_available_mib} sleep_seconds={sleep_seconds} hold_seconds={hold_seconds}"
    )

    try:
        step = 0
        while allocated_mib < max_alloc_mib and not stopping:
            meminfo = read_meminfo()
            available_mib = mib_from_kib(meminfo.get("MemAvailable", 0))
            if available_mib <= min_available_mib:
                log(
                    "stop_threshold_reached "
                    f"allocated_mib={allocated_mib} mem_available_mib={available_mib:.1f}"
                )
                break

            # The last chunk is trimmed so the total never exceeds the ceiling.
            this_chunk_mib = min(chunk_mib, max_alloc_mib - allocated_mib)
            mapping = mmap.mmap(-1, this_chunk_mib * 1024 * 1024)
            # Track the mapping before touching it so the cleanup below also
            # covers a failure part-way through touch_mapping().
            mappings.append(mapping)
            touch_mapping(mapping)
            allocated_mib += this_chunk_mib
            step += 1

            status = read_status()
            meminfo = read_meminfo()
            log(
                "step "
                f"step={step} allocated_mib={allocated_mib} "
                f"vmrss={status.get('VmRSS', 'NA')} vmhwm={status.get('VmHWM', 'NA')} "
                f"mem_available_mib={mib_from_kib(meminfo.get('MemAvailable', 0)):.1f} "
                f"mem_free_mib={mib_from_kib(meminfo.get('MemFree', 0)):.1f} "
                f"swap_free_mib={mib_from_kib(meminfo.get('SwapFree', 0)):.1f}"
            )

            interruptible_sleep(sleep_seconds)

        if not stopping:
            log(f"holding allocated_mib={allocated_mib} hold_seconds={hold_seconds}")
            interruptible_sleep(hold_seconds)

        # `stopping` may have been set during the ramp or during the hold.
        if stopping:
            log(f"stopped_by_signal allocated_mib={allocated_mib}")
        else:
            log(f"memory_ramp_done allocated_mib={allocated_mib}")
    except MemoryError:
        log(f"memory_error allocated_mib={allocated_mib}")
        return 2
    except BaseException as exc:
        # Deliberate catch-all: every failure is logged and turned into a
        # non-zero return value instead of a traceback.
        log(f"unexpected_error type={type(exc).__name__} message={exc}")
        return 1
    finally:
        # Release the mappings explicitly rather than relying on process exit.
        for held in mappings:
            held.close()

    return 0


if __name__ == "__main__":
    # main()'s integer result becomes the process exit status.
    raise SystemExit(main())

Ask Codex to run the script in WSL, then wait for WSL to crash.

What is the expected behavior?

The standard OS process killer kills the process.

Additional information

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working) · tool-calls (Issues related to tool calling) · windows-os (Issues related to Codex on Windows systems)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions