diff --git a/nemo_retriever/src/nemo_retriever/graph/abstract_operator.py b/nemo_retriever/src/nemo_retriever/graph/abstract_operator.py
index b0f5ee5081..4dae44aa09 100644
--- a/nemo_retriever/src/nemo_retriever/graph/abstract_operator.py
+++ b/nemo_retriever/src/nemo_retriever/graph/abstract_operator.py
@@ -6,12 +6,54 @@
 
 from abc import ABC, abstractmethod
 import inspect
+import os
+import resource
+import time
 from typing import Any, TYPE_CHECKING
 
+from nemo_retriever.utils import stage_timing
+
 if TYPE_CHECKING:
     from nemo_retriever.graph.pipeline_graph import Graph, Node
 
 
+try:  # psutil is in the retriever runtime; degrade gracefully if missing
+    import psutil as _psutil
+
+    _PROC = _psutil.Process()
+except Exception:  # pragma: no cover
+    _psutil = None
+    _PROC = None
+
+
+def _safe_len(data: Any) -> int:
+    try:
+        return len(data)
+    except Exception:
+        return -1
+
+
+def _mem_snapshot() -> tuple[float, float]:
+    """Return (process_rss_mb, host_available_mb). Zeros if psutil unavailable."""
+    if _PROC is None or _psutil is None:
+        return 0.0, 0.0
+    try:
+        rss = _PROC.memory_info().rss / 1e6
+        avail = _psutil.virtual_memory().available / 1e6
+        return rss, avail
+    except Exception:
+        return 0.0, 0.0
+
+
+def _process_peak_rss_mb() -> float:
+    """Return this process's peak RSS (``ru_maxrss``) in MB (1e6 bytes); 0.0 on failure."""
+    try:
+        # Linux reports ru_maxrss in KiB; go via bytes so the unit matches _mem_snapshot (MB, not MiB).
+        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024 / 1e6
+    except Exception:
+        return 0.0
+
+
 class AbstractOperator(ABC):
     """Base class for all pipeline operators."""
 
@@ -30,9 +72,39 @@ def process(self, data: Any, **kwargs: Any) -> Any: ...
     def postprocess(self, data: Any, **kwargs: Any) -> Any: ...
 
     def run(self, data: Any, **kwargs: Any) -> Any:
+        if not stage_timing.is_enabled():  # fast path: run the three phases untimed
+            data = self.preprocess(data, **kwargs)
+            data = self.process(data, **kwargs)
+            data = self.postprocess(data, **kwargs)
+            return data
+
+        stage = getattr(self, "_nr_stage_name", None) or type(self).__name__  # stamped label, else class name
+        n_in = _safe_len(data)  # -1 when the batch has no usable len()
+        rss_b, avail_b = _mem_snapshot()
+        t0 = time.perf_counter()
         data = self.preprocess(data, **kwargs)
+        t1 = time.perf_counter()
         data = self.process(data, **kwargs)
+        t2 = time.perf_counter()
         data = self.postprocess(data, **kwargs)
+        t3 = time.perf_counter()
+        rss_a, avail_a = _mem_snapshot()
+        stage_timing.record_timing(
+            stage=stage,
+            n_rows_in=n_in,
+            n_rows_out=_safe_len(data),
+            preprocess_ms=(t1 - t0) * 1000.0,
+            process_ms=(t2 - t1) * 1000.0,
+            postprocess_ms=(t3 - t2) * 1000.0,
+            total_ms=(t3 - t0) * 1000.0,
+            worker_pid=os.getpid(),
+            wallclock_start=time.time() - (time.perf_counter() - t0),  # epoch seconds at t0, not a perf_counter value
+            rss_before_mb=rss_b,
+            rss_after_mb=rss_a,
+            rss_peak_mb=_process_peak_rss_mb(),
+            avail_before_mb=avail_b,
+            avail_after_mb=avail_a,
+        )
         return data
 
     def __call__(self, data: Any, **kwargs: Any) -> Any:
diff --git a/nemo_retriever/src/nemo_retriever/graph/executor.py b/nemo_retriever/src/nemo_retriever/graph/executor.py
index 14a323ab08..701591d462 100644
--- a/nemo_retriever/src/nemo_retriever/graph/executor.py
+++ b/nemo_retriever/src/nemo_retriever/graph/executor.py
@@ -29,6 +29,7 @@
     VLLM_GPUS_PER_ACTOR,
     OCR_GPUS_PER_ACTOR,
 )
+from nemo_retriever.utils import stage_timing
 
 import logging
 
@@ -249,6 +250,10 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
             }
             ray_env_vars.update(collect_hf_runtime_env())
             ray_env_vars.update(collect_remote_auth_runtime_env())
+            for _name in (stage_timing.ENABLED_ENV, stage_timing.REPORT_PATH_ENV):
+                _val = os.environ.get(_name)
+                if _val:
+                    ray_env_vars[_name] = _val
             os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
             runtime_env = {"env_vars": ray_env_vars}
             ray.init(
@@ -257,6 +262,10 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
                 runtime_env=runtime_env,
             )
 
+        timing_enabled = stage_timing.is_enabled()
+        timing_collector = stage_timing.start_collector() if timing_enabled else None
+        timing_mem_sampler = stage_timing.start_memory_sampler(timing_collector) if timing_collector else None
+
         ctx = rd.DataContext.get_current()
         ctx.enable_rich_progress_bars = True
         ctx.use_ray_tqdm = False
@@ -273,6 +282,24 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
             except FileNotFoundError as exc:
                 raise_input_path_not_found(input_paths or [], exc)
         nodes = self._linearize(resolved_graph)
+        timing_call_index: Optional[int] = None
+        timing_node_names: List[str] = [n.name for n in nodes]
+        timing_graph_label: Optional[str] = None
+        if timing_enabled:
+            timing_call_index = stage_timing.next_call_index()
+            timing_graph_label = stage_timing.slugify_graph_label(timing_node_names)
+            report_path = stage_timing.resolve_report_path(timing_call_index, timing_graph_label)
+            logger.info(
+                "RayDataExecutor.ingest() #%02d | %d nodes: [%s] | label=%s | report -> %s",
+                timing_call_index,
+                len(timing_node_names),
+                ", ".join(timing_node_names),
+                timing_graph_label,
+                report_path if report_path is not None else "<stdout only>",
+            )
+            # Per-node operator class stamping happens inside the map_batches
+            # loop below via stage_timing.make_named_operator_class so two
+            # nodes sharing the same operator class get distinct stage names.
         for node in nodes:
             overrides = dict(self._node_overrides.get(node.name, {}))
             target_num_rows_per_block = overrides.pop("target_num_rows_per_block", None)
@@ -360,18 +387,24 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
                 group_keys = list(getattr(node.operator_class, "GLOBAL_BATCH_GROUP_KEYS", None) or ())
                 n_blocks = max(1, int(overrides.get("concurrency") or 1)) if group_keys else 1
                 if n_blocks > 1:
-                    ds = ds.repartition(num_blocks=n_blocks, keys=group_keys, shuffle=True)
+                    # ds = ds.repartition(num_blocks=n_blocks, keys=group_keys, shuffle=True)
+                    pass
                 else:
-                    ds = ds.repartition(num_blocks=1)
+                    # ds = ds.repartition(num_blocks=1)
+                    pass
             elif target_num_rows_per_block is not None and int(target_num_rows_per_block) > 0:
-                ds = ds.repartition(target_num_rows_per_block=int(target_num_rows_per_block))
+                # ds = ds.repartition(target_num_rows_per_block=int(target_num_rows_per_block))
+                pass
 
             # Pass the operator class directly to map_batches with
             # fn_constructor_kwargs for deferred construction on workers.
             # AbstractOperator.__call__ delegates to run(), so each stage
             # executes the full preprocess -> process -> postprocess chain.
+            operator_cls = node.operator_class
+            if timing_enabled:
+                operator_cls = stage_timing.make_named_operator_class(operator_cls, node.name)
             ds = ds.map_batches(
-                node.operator_class,
+                operator_cls,
                 batch_size=batch_size,
                 batch_format=batch_format,
                 num_cpus=num_cpus,
@@ -380,4 +413,39 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
                 **overrides,
             )
 
-        return ds.to_pandas()
+        result = ds.to_pandas()
+        if timing_collector is not None:
+            try:
+                # Stop the memory sampler before dumping so no late samples
+                # arrive after we've read the collector.
+                stage_timing.stop_memory_sampler(timing_mem_sampler)
+                ray_stats_text = None
+                try:
+                    ray_stats_text = ds.stats()
+                except Exception as exc:
+                    logger.warning("Failed to collect ds.stats(): %s", exc)
+                try:
+                    records = ray.get(timing_collector.dump.remote())
+                except Exception as exc:
+                    logger.warning("Failed to retrieve stage timing records: %s", exc)
+                    records = []
+                try:
+                    memory_samples = ray.get(timing_collector.dump_samples.remote())
+                except Exception as exc:
+                    logger.warning("Failed to retrieve memory samples: %s", exc)
+                    memory_samples = []
+                baseline_used_mb = (
+                    getattr(timing_mem_sampler, "baseline_sys_used_mb", 0.0) if timing_mem_sampler else None
+                )
+                stage_timing.write_report(
+                    records,
+                    ray_stats_text=ray_stats_text,
+                    call_index=timing_call_index,
+                    graph_label=timing_graph_label,
+                    node_names=timing_node_names,
+                    memory_samples=memory_samples,
+                    baseline_sys_used_mb=baseline_used_mb,
+                )
+            finally:
+                stage_timing.stop_collector(timing_collector)
+        return result
diff --git a/nemo_retriever/src/nemo_retriever/graph/operator_archetype.py b/nemo_retriever/src/nemo_retriever/graph/operator_archetype.py
index 3fb9682631..80ac18c61e 100644
--- a/nemo_retriever/src/nemo_retriever/graph/operator_archetype.py
+++ b/nemo_retriever/src/nemo_retriever/graph/operator_archetype.py
@@ -88,7 +88,14 @@ def _resolve_delegate(self, resources: ClusterResources | Resources | None = Non
         if operator_class is type(self):
             raise RuntimeError(f"{type(self).__name__} could not resolve a concrete hardware-specific operator.")
 
-        delegate = operator_class(**operator_kwargs)
+        resolved_kwargs = type(self).variant_operator_kwargs(operator_class, operator_kwargs)
+        delegate = operator_class(**resolved_kwargs)
+        # Propagate the pipeline-level stage name (set by RayDataExecutor on the
+        # archetype class) so the delegate's run() can label timing records
+        # consistently with the node name rather than the variant class name.
+        stage_name = getattr(self, "_nr_stage_name", None)
+        if stage_name is not None:
+            delegate._nr_stage_name = stage_name
         self._resolved_delegate = delegate
         self._resolved_delegate_key = cache_key
         return delegate
diff --git a/nemo_retriever/src/nemo_retriever/utils/stage_timing.py b/nemo_retriever/src/nemo_retriever/utils/stage_timing.py
new file mode 100644
index 0000000000..337d258823
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/utils/stage_timing.py
@@ -0,0 +1,617 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Per-stage / per-batch timing for the Ray batch executor.
+
+When the ``NR_STAGE_TIMING=1`` environment variable is set, the
+``RayDataExecutor`` starts a detached named Ray actor that collects
+records emitted by :class:`AbstractOperator.run` on every worker.
+After the pipeline materialises, the executor pulls the records,
+combines them with ``ds.stats()`` text, and writes a human-readable
+report (and optional JSON dump via ``NR_STAGE_TIMING_REPORT_PATH``).
+"""
+
+from __future__ import annotations
+
+import datetime
+import json
+import logging
+import os
+import re
+import threading
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+logger = logging.getLogger(__name__)
+
+COLLECTOR_NAME = "nr_stage_timing_collector"
+ENABLED_ENV = "NR_STAGE_TIMING"
+REPORT_PATH_ENV = "NR_STAGE_TIMING_REPORT_PATH"
+
+# Driver-side counter of RayDataExecutor.ingest() invocations within this process.
+# Used by Phase 1 diagnostics to identify each graph execution distinctly.
+_INGEST_CALL_COUNTER = 0
+_INGEST_CALL_COUNTER_LOCK = threading.Lock()
+
+# Captured once per process so every report file from one run sorts together.
+_RUN_TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+
+
+def resolve_report_path(call_index: Optional[int], graph_label: Optional[str]) -> Optional[Path]:
+    """Build the per-call JSON report path from ``NR_STAGE_TIMING_REPORT_PATH``.
+
+    Returns ``None`` when the env var is unset or empty (file output disabled).
+    Otherwise the value is used as a *base* and the output directory is created:
+
+    * A value that ends with a path separator, or names an existing directory,
+      yields ``<dir>/timing_<ts>_<NN>_<label>.json``.
+    * Any other value is split into parent directory and stem, yielding
+      ``<parent>/<stem>_<ts>_<NN>_<label>.json`` -- so ``/tmp/timing.json``
+      produces ``/tmp/timing_<ts>_<NN>_<label>.json`` per call instead of one
+      overwritten file.  ``<NN>`` falls back to ``00`` and ``<label>`` to
+      ``graph`` when the corresponding argument is missing.
+    """
+    base = os.environ.get(REPORT_PATH_ENV)
+    if not base:
+        return None
+    base_path = Path(base)
+    if base.endswith((os.sep, "/")) or base_path.is_dir():
+        target_dir, prefix = base_path, "timing"
+    else:
+        # Path.stem equals Path.name when there is no suffix, and Path.parent
+        # is never an empty string, so neither needs a special case.
+        target_dir, prefix = base_path.parent, base_path.stem or "timing"
+    # Create the directory up front so a later write does not hit a missing parent.
+    target_dir.mkdir(parents=True, exist_ok=True)
+    seq = "00" if call_index is None else f"{call_index:02d}"
+    suffix_label = graph_label or "graph"
+    return target_dir / f"{prefix}_{_RUN_TIMESTAMP}_{seq}_{suffix_label}.json"
+
+
+def next_call_index() -> int:
+    """Increment and return the per-process ingest-call counter (starts at 1)."""
+    global _INGEST_CALL_COUNTER
+    with _INGEST_CALL_COUNTER_LOCK:
+        _INGEST_CALL_COUNTER += 1
+        return _INGEST_CALL_COUNTER
+
+
+_SLUG_RE = re.compile(r"[^a-z0-9]+")
+
+
+def make_named_operator_class(base_cls: type, stage_name: str) -> type:
+    """Derive a subclass of *base_cls* whose ``_nr_stage_name`` is *stage_name*.
+
+    The subclass keeps the base class's ``__name__`` and ``__module__``; only
+    the stage label differs.  Stamping the label on a fresh subclass rather
+    than on *base_cls* itself means two pipeline nodes that share an operator
+    class (e.g. duplicate embedders) each keep their own label, and the
+    shared base class is left untouched.
+    """
+    namespace = {
+        "_nr_stage_name": stage_name,
+        "__module__": base_cls.__module__,
+    }
+    return type(base_cls.__name__, (base_cls,), namespace)
+
+
+def slugify_graph_label(node_names: Iterable[str], *, max_len: int = 40) -> str:
+    """Join node names into a lowercase dash-separated label, cut to *max_len* with a ``+`` marker."""
+    # Lower-case each name, collapse every non-alphanumeric run to "-" and
+    # trim leading/trailing dashes; names that slug to nothing are dropped.
+    slugs = (_SLUG_RE.sub("-", str(raw).lower()).strip("-") for raw in node_names)
+    joined = "-".join(s for s in slugs if s) or "graph"
+    if len(joined) <= max_len:
+        return joined
+    # Too long: keep max_len - 1 characters, drop a dangling dash, and append
+    # "+" to flag the truncation.
+    return joined[: max_len - 1].rstrip("-") + "+"
+
+
+@dataclass
+class StageRecord:
+    stage: str
+    n_rows_in: int
+    n_rows_out: int
+    preprocess_ms: float
+    process_ms: float
+    postprocess_ms: float
+    total_ms: float
+    worker_pid: int
+    wallclock_start: float
+    # Memory metrics (host-side, per worker process). As filled by
+    # AbstractOperator.run, rss_before/after and avail_* are zero without psutil;
+    # ``rss_peak_mb`` (resource.getrusage) is the process-lifetime high-water mark
+    # at batch end -- it only grows, so diffs across batches show what raised it.
+    rss_before_mb: float = 0.0
+    rss_after_mb: float = 0.0
+    rss_peak_mb: float = 0.0
+    avail_before_mb: float = 0.0
+    avail_after_mb: float = 0.0
+    extra: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class SystemSample:
+    """A single driver-side snapshot of host + Ray-worker memory.
+
+    Memory fields use **PSS** (proportional set size) on Linux so that shared
+    library / mmap pages are not double-counted across processes -- the sum
+    of PSS across processes ~= actual physical memory used.  When PSS is
+    unavailable (non-Linux), the sampler transparently falls back to RSS;
+    field names stay the same to keep the JSON schema stable.
+    """
+
+    t_rel_s: float
+    driver_pss_mb: float
+    workers_pss_mb: float
+    workload_pss_mb: float  # driver + workers (PSS)
+    sys_used_mb: float  # host-wide MemUsed from psutil.virtual_memory().used
+    sys_available_mb: float
+    sys_used_pct: float
+    n_workers: int
+
+
+def is_enabled() -> bool:
+    """Return True if stage timing is enabled via the env var."""
+    return os.environ.get(ENABLED_ENV, "").lower() in ("1", "true", "yes", "on")
+
+
+# Cache the actor handle per-worker so we don't pay the ray.get_actor cost per batch.
+_collector_cache: Dict[str, Any] = {"handle": None, "tried": False}
+
+
+def _get_collector() -> Any:
+    if _collector_cache["tried"]:
+        return _collector_cache["handle"]
+    _collector_cache["tried"] = True
+    try:
+        import ray  # local import: ray may not be installed in non-batch contexts
+
+        _collector_cache["handle"] = ray.get_actor(COLLECTOR_NAME)
+    except Exception:
+        _collector_cache["handle"] = None
+    return _collector_cache["handle"]
+
+
+def record_timing(**fields: Any) -> None:
+    """Fire-and-forget submission of a :class:`StageRecord` to the collector.
+
+    Safe to call from any worker. If the collector actor cannot be found
+    (e.g. timing disabled, or running outside Ray), this is a no-op.
+    """
+    handle = _get_collector()
+    if handle is None:
+        return
+    try:
+        handle.record.remote(StageRecord(**fields))
+    except Exception:
+        # Never let timing break the pipeline.
+        pass
+
+
+def _build_collector_class() -> Any:
+    import ray
+
+    @ray.remote(num_cpus=0)
+    class StageTimingCollector:
+        def __init__(self) -> None:
+            self._records: List[StageRecord] = []
+            self._samples: List[SystemSample] = []
+
+        def record(self, rec: StageRecord) -> None:
+            self._records.append(rec)
+
+        def record_sample(self, sample: SystemSample) -> None:
+            self._samples.append(sample)
+
+        def dump(self) -> List[Dict[str, Any]]:
+            return [asdict(r) for r in self._records]
+
+        def dump_samples(self) -> List[Dict[str, Any]]:
+            return [asdict(s) for s in self._samples]
+
+        def clear(self) -> None:
+            self._records.clear()
+            self._samples.clear()
+
+    return StageTimingCollector
+
+
+def _enumerate_ray_worker_processes() -> List[Any]:
+    """Return psutil.Process handles for Ray worker processes on this host.
+
+    Ray worker processes are named ``ray::<actor_class>`` (or ``ray::IDLE``).
+    Returns an empty list if psutil is unavailable.
+    """
+    try:
+        import psutil
+    except Exception:
+        return []
+    workers: List[Any] = []
+    try:
+        for p in psutil.process_iter(["pid", "name"]):
+            try:
+                name = p.info.get("name") or ""
+                if name.startswith("ray::"):
+                    workers.append(p)
+            except Exception:
+                continue
+    except Exception:
+        return []
+    return workers
+
+
+def _process_pss_bytes(p: Any) -> int:
+    """Return *p*'s PSS in bytes, else its RSS, else 0 -- this helper never raises ``Exception``."""
+    # memory_full_info() may lack ``pss`` or fail outright; fall back to RSS.
+    for read in (lambda: p.memory_full_info().pss, lambda: p.memory_info().rss):
+        try:
+            return int(read())
+        except Exception:
+            continue
+    return 0
+
+
+class _MemorySampler(threading.Thread):
+    """Daemon thread that snapshots driver + worker memory every ``interval_s`` seconds.
+
+    Each tick sums PSS (RSS when PSS cannot be read) over the driver and all
+    ``ray::*`` processes, adds host-wide ``psutil.virtual_memory()`` figures,
+    and sends one ``SystemSample`` to the collector actor.  Summing PSS avoids
+    the double-counting of shared pages that summing RSS would incur.
+    """
+
+    def __init__(self, collector_handle: Any, interval_s: float = 1.0) -> None:
+        super().__init__(daemon=True)
+        self._collector = collector_handle
+        self._interval = float(interval_s)
+        self._stop_event = threading.Event()  # not "_stop": that name is a threading.Thread internal method
+        self._t0 = 0.0
+        # Captured eagerly in start_memory_sampler before run() begins.
+        self.baseline_sys_used_mb: float = 0.0
+
+    def run(self) -> None:  # pragma: no cover - thread loop
+        import time as _time
+
+        try:
+            import psutil
+        except Exception:
+            return  # no psutil: nothing to sample, exit the thread quietly
+        try:
+            driver = psutil.Process()
+        except Exception:
+            return
+        self._t0 = _time.perf_counter()
+        while not self._stop_event.wait(self._interval):  # wait() doubles as the sampling sleep
+            try:
+                drv_mem = _process_pss_bytes(driver)
+                workers = _enumerate_ray_worker_processes()
+                worker_mem = 0
+                live = 0
+                for p in workers:
+                    v = _process_pss_bytes(p)
+                    if v > 0:  # 0 means the process could not be read; do not count it as live
+                        worker_mem += v
+                        live += 1
+                vm = psutil.virtual_memory()
+                sample = SystemSample(
+                    t_rel_s=_time.perf_counter() - self._t0,
+                    driver_pss_mb=drv_mem / 1e6,
+                    workers_pss_mb=worker_mem / 1e6,
+                    workload_pss_mb=(drv_mem + worker_mem) / 1e6,
+                    sys_used_mb=float(vm.used) / 1e6,
+                    sys_available_mb=vm.available / 1e6,
+                    sys_used_pct=float(vm.percent),
+                    n_workers=live,
+                )
+                try:
+                    self._collector.record_sample.remote(sample)
+                except Exception:
+                    pass  # best effort: a lost sample must not stop the sampler
+            except Exception:
+                continue
+
+    def stop(self, timeout: float = 2.0) -> None:
+        self._stop_event.set()  # wakes the loop immediately instead of waiting out the interval
+        if self.is_alive():
+            self.join(timeout=timeout)
+
+
+def start_memory_sampler(collector_handle: Any, interval_s: float = 1.0) -> Optional[_MemorySampler]:
+    """Start a daemon sampler thread.
+
+    Also captures a baseline of ``psutil.virtual_memory().used`` *before*
+    sampling begins so the report can show "memory the run added" rather
+    than absolute host usage.
+    """
+    if collector_handle is None:
+        return None
+    sampler = _MemorySampler(collector_handle, interval_s=interval_s)
+    try:
+        import psutil
+
+        sampler.baseline_sys_used_mb = float(psutil.virtual_memory().used) / 1e6
+    except Exception:
+        sampler.baseline_sys_used_mb = 0.0
+    sampler.start()
+    return sampler
+
+
+def stop_memory_sampler(sampler: Optional[_MemorySampler]) -> None:
+    """Stop the sampler thread (no-op if ``None``)."""
+    if sampler is not None:
+        try:
+            sampler.stop()
+        except Exception:
+            pass
+
+
+def start_collector() -> Any:
+    """Create the named, detached collector actor. Idempotent."""
+    import ray
+
+    try:
+        return ray.get_actor(COLLECTOR_NAME)
+    except Exception:
+        pass
+    cls = _build_collector_class()
+    return cls.options(name=COLLECTOR_NAME, lifetime="detached").remote()
+
+
+def stop_collector(handle: Any) -> None:
+    """Kill the collector actor if it exists. Safe to call with None."""
+    if handle is None:
+        return
+    try:
+        import ray
+
+        ray.kill(handle)
+    except Exception:
+        pass
+
+
+def _fmt_int(n: int) -> str:
+    return "-" if n < 0 else f"{n:,}"  # negatives render as a dash, everything else with thousands separators
+
+
+def format_report(
+    records: List[Dict[str, Any]],
+    ray_stats_text: Optional[str] = None,
+    *,
+    call_index: Optional[int] = None,
+    graph_label: Optional[str] = None,
+    node_names: Optional[Iterable[str]] = None,
+    memory_samples: Optional[List[Dict[str, Any]]] = None,
+    baseline_sys_used_mb: Optional[float] = None,
+) -> str:
+    """Build a human-readable per-stage timing report from collected records."""
+    memory_block = ""
+    if not records:
+        body = "(no stage timing records were collected)"
+    else:
+        by_stage: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+        for r in records:
+            by_stage[r["stage"]].append(r)
+
+        cols = ("stage", "batches", "rows_in", "total_s", "pre_ms/b", "proc_ms/b", "post_ms/b", "ms/row")
+        widths = (34, 9, 11, 10, 11, 11, 11, 10)
+        header = "".join(c.rjust(w) if i else c.ljust(w) for i, (c, w) in enumerate(zip(cols, widths)))
+        sep = "-" * len(header)
+        lines = [header, sep]
+
+        pipeline_total_ms = 0.0
+        ordered_stages = sorted(by_stage.items(), key=lambda kv: -sum(r["total_ms"] for r in kv[1]))
+        for stage, recs in ordered_stages:
+            n_batches = len(recs)
+            rows_in = sum(max(r["n_rows_in"], 0) for r in recs)
+            total_ms = sum(r["total_ms"] for r in recs)
+            pre_avg = sum(r["preprocess_ms"] for r in recs) / n_batches
+            proc_avg = sum(r["process_ms"] for r in recs) / n_batches
+            post_avg = sum(r["postprocess_ms"] for r in recs) / n_batches
+            ms_per_row = total_ms / rows_in if rows_in > 0 else 0.0
+            pipeline_total_ms += total_ms
+            row = (
+                stage[: widths[0]].ljust(widths[0])
+                + _fmt_int(n_batches).rjust(widths[1])
+                + _fmt_int(rows_in).rjust(widths[2])
+                + f"{total_ms / 1000:.2f}".rjust(widths[3])
+                + f"{pre_avg:.2f}".rjust(widths[4])
+                + f"{proc_avg:.2f}".rjust(widths[5])
+                + f"{post_avg:.2f}".rjust(widths[6])
+                + f"{ms_per_row:.3f}".rjust(widths[7])
+            )
+            lines.append(row)
+        lines.append(sep)
+        lines.append(f"sum of stage wall-time (worker-side, parallel): {pipeline_total_ms / 1000:.2f} s")
+        body = "\n".join(lines)
+
+        # Memory section: only emitted when at least one record carries a
+        # non-zero rss measurement (psutil was available on the workers).
+        has_mem = any(float(r.get("rss_peak_mb") or 0.0) > 0 for r in records)
+        if has_mem:
+            mcols = ("stage", "peak_rss_mb", "max_rss_mb", "mean_delta_mb", "min_avail_mb")
+            mwidths = (34, 14, 14, 16, 14)
+            mhead = "".join(c.rjust(w) if i else c.ljust(w) for i, (c, w) in enumerate(zip(mcols, mwidths)))
+            msep = "-" * len(mhead)
+            mlines = [mhead, msep]
+            for stage, recs in ordered_stages:
+                peak = max((float(r.get("rss_peak_mb") or 0.0) for r in recs), default=0.0)
+                max_after = max((float(r.get("rss_after_mb") or 0.0) for r in recs), default=0.0)
+                deltas = [float(r.get("rss_after_mb") or 0.0) - float(r.get("rss_before_mb") or 0.0) for r in recs]
+                mean_delta = sum(deltas) / len(deltas) if deltas else 0.0
+                min_avail = min(
+                    (
+                        float(r.get("avail_before_mb") or 0.0)
+                        for r in recs
+                        if float(r.get("avail_before_mb") or 0.0) > 0
+                    ),
+                    default=0.0,
+                )
+                mlines.append(
+                    stage[: mwidths[0]].ljust(mwidths[0])
+                    + f"{peak:.1f}".rjust(mwidths[1])
+                    + f"{max_after:.1f}".rjust(mwidths[2])
+                    + f"{mean_delta:+.2f}".rjust(mwidths[3])
+                    + f"{min_avail:.0f}".rjust(mwidths[4])
+                )
+            mlines.append(msep)
+            memory_block = "\n".join(mlines)
+
+    title = "NeMo Retriever - Stage Timing Report"
+    if call_index is not None:
+        title += f"  (graph #{call_index:02d}"
+        if graph_label:
+            title += f": {graph_label}"
+        title += ")"
+    header_lines = [
+        "=" * 96,
+        title,
+        "=" * 96,
+        "",
+        "Per-stage totals and per-batch / per-row averages (worker-side timing).",
+        "preprocess/process/postprocess columns are mean milliseconds per batch.",
+        "ms/row = total_ms / sum(rows_in)  - the per-chunk average across all batches.",
+    ]
+    if node_names:
+        nodes_str = " -> ".join(node_names)
+        if len(nodes_str) > 200:
+            nodes_str = nodes_str[:197] + "..."
+        header_lines.append(f"graph nodes: {nodes_str}")
+    header_lines.append("")
+    out = header_lines + [body]
+    if memory_block:
+        out += ["", "Per-stage memory (host-side, per worker process):", memory_block]
+    if memory_samples:
+        peak_workload = max((float(s.get("workload_pss_mb") or 0.0) for s in memory_samples), default=0.0)
+        peak_driver = max((float(s.get("driver_pss_mb") or 0.0) for s in memory_samples), default=0.0)
+        peak_workers = max((float(s.get("workers_pss_mb") or 0.0) for s in memory_samples), default=0.0)
+        min_avail = min(
+            (
+                float(s.get("sys_available_mb") or 0.0)
+                for s in memory_samples
+                if float(s.get("sys_available_mb") or 0.0) > 0
+            ),
+            default=0.0,
+        )
+        max_used_mb = max((float(s.get("sys_used_mb") or 0.0) for s in memory_samples), default=0.0)
+        max_used_pct = max((float(s.get("sys_used_pct") or 0.0) for s in memory_samples), default=0.0)
+        mean_workers = (
+            sum(int(s.get("n_workers") or 0) for s in memory_samples) / len(memory_samples) if memory_samples else 0.0
+        )
+        # "Memory the run added": host-used at peak minus host-used before
+        # the sampler started.  This is the cleanest "what did this run cost"
+        # number -- it ignores stale Ray IDLE workers, kernel caches, and
+        # anything else that was already resident before ingestion began.
+        delta_lines: List[str] = []
+        if baseline_sys_used_mb is not None and baseline_sys_used_mb > 0:
+            delta = max_used_mb - float(baseline_sys_used_mb)
+            delta_lines = [
+                f"  baseline host MemUsed (pre-run)          : {baseline_sys_used_mb:8.1f} MB",
+                f"  peak host MemUsed (during run)           : {max_used_mb:8.1f} MB",
+                f"  delta MemUsed (memory the run added)     : {delta:+8.1f} MB",
+            ]
+        out += [
+            "",
+            "Run-level memory (driver-sampled, PSS-based):",
+            f"  peak workload PSS (driver + ray workers) : {peak_workload:8.1f} MB",
+            f"  peak driver PSS                          : {peak_driver:8.1f} MB",
+            f"  peak ray-workers PSS (sum)               : {peak_workers:8.1f} MB",
+            *delta_lines,
+            f"  host worst-case available                : {min_avail:8.1f} MB",
+            f"  host worst-case used                     : {max_used_pct:8.1f} %",
+            f"  mean ray-worker count seen               : {mean_workers:8.1f}",
+            f"  sample count                             : {len(memory_samples)}",
+            "",
+            "  Note: PSS attributes shared pages proportionally across the",
+            "  processes mapping them; sum-of-PSS ~= true physical memory used.",
+            "  'delta MemUsed' is the most defensible 'this run's cost' figure.",
+        ]
+    if ray_stats_text:
+        out += [
+            "",
+            "=" * 96,
+            "Ray Data ds.stats() (driver-side wall time per stage)",
+            "=" * 96,
+            ray_stats_text.rstrip(),
+        ]
+    return "\n".join(out)
+
+
+def write_report(
+    records: List[Dict[str, Any]],
+    ray_stats_text: Optional[str] = None,
+    *,
+    call_index: Optional[int] = None,
+    graph_label: Optional[str] = None,
+    node_names: Optional[Iterable[str]] = None,
+    memory_samples: Optional[List[Dict[str, Any]]] = None,
+    baseline_sys_used_mb: Optional[float] = None,
+) -> str:
+    """Format the report, log it, and optionally dump it as JSON.
+
+    The text from :func:`format_report` is always logged at INFO level.  When
+    :func:`resolve_report_path` yields a path, a JSON document holding the
+    call metadata, headline memory figures, raw ``records``, raw
+    ``memory_samples``, ``ray_stats_text`` and the rendered text is written
+    there as well.  Any failure while resolving the path (including creating
+    its directory) or writing the file is logged as a warning and swallowed,
+    so reporting problems never discard the caller's pipeline result.
+
+    Returns:
+        The rendered report text.
+    """
+    # Materialise both iterables once; they feed format_report and the JSON payload.
+    node_list = list(node_names) if node_names is not None else None
+    samples = list(memory_samples) if memory_samples is not None else None
+    text = format_report(
+        records,
+        ray_stats_text,
+        call_index=call_index,
+        graph_label=graph_label,
+        node_names=node_list,
+        memory_samples=samples,
+        baseline_sys_used_mb=baseline_sys_used_mb,
+    )
+    logger.info("\n%s", text)
+    out_path: Optional[Path] = None
+    try:
+        # Resolved inside the try: it creates the output directory and may raise.
+        out_path = resolve_report_path(call_index, graph_label)
+        if out_path is None:
+            return text
+        rows = samples or []
+        peak_workload_pss_mb = max((float(s.get("workload_pss_mb") or 0.0) for s in rows), default=0.0)
+        peak_sys_used_mb = max((float(s.get("sys_used_mb") or 0.0) for s in rows), default=0.0)
+        # Zero / missing availability readings are ignored so they cannot win the minimum.
+        avail_values = [float(s.get("sys_available_mb") or 0.0) for s in rows]
+        min_sys_available_mb = min((v for v in avail_values if v > 0), default=0.0)
+        # The delta is only meaningful when both the baseline and a peak were observed.
+        delta_sys_used_mb = (
+            peak_sys_used_mb - float(baseline_sys_used_mb)
+            if (baseline_sys_used_mb is not None and baseline_sys_used_mb > 0 and peak_sys_used_mb > 0)
+            else None
+        )
+        payload = {
+            "call_index": call_index,
+            "graph_label": graph_label,
+            "node_names": node_list,
+            "run_timestamp": _RUN_TIMESTAMP,
+            "peak_workload_pss_mb": peak_workload_pss_mb,
+            "min_sys_available_mb": min_sys_available_mb,
+            "baseline_sys_used_mb": baseline_sys_used_mb,
+            "peak_sys_used_mb": peak_sys_used_mb,
+            "delta_sys_used_mb": delta_sys_used_mb,
+            "records": records,
+            "memory_samples": rows,
+            "ray_stats": ray_stats_text,
+            "report": text,
+        }
+        with open(out_path, "w") as f:
+            json.dump(payload, f, indent=2, default=str)
+        logger.info("Stage timing report written to %s", out_path)
+    except Exception as exc:
+        logger.warning("Failed to write stage timing report to %s: %s", out_path, exc)
+    return text
diff --git a/nemo_retriever/src/nemo_retriever/utils/stage_timing_memory.md b/nemo_retriever/src/nemo_retriever/utils/stage_timing_memory.md
new file mode 100644
index 0000000000..ae1224539a
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/utils/stage_timing_memory.md
@@ -0,0 +1,204 @@
+# Measuring "Real" Host Memory in Ray Data Runs
+
+This note explains why the stage-timing instrumentation in
+`stage_timing.py` had to grow two extra layers — **PSS accounting** and
+**baseline subtraction** — before its memory numbers reflected anything
+useful. If you just want the bottom line, skip to
+[How to read the report](#how-to-read-the-report).
+
+## The problem this solves
+
+The first cut of memory tracking did the obvious thing: each tick of the
+driver-side sampler enumerated `ray::*` processes on the host and summed
+their `psutil.Process().memory_info().rss`. The number it produced was
+catastrophically wrong, but in a way that *looked plausible*.
+
+Here is what a real ingestion run reported with that naive approach:
+
+```
+peak workload RSS (driver + ray workers) : 357,332 MB   (≈ 350 GB)
+peak ray-workers RSS (sum)               : 289,912 MB
+host worst-case used                     :       37.4 %
+```
+
+The host reported **37% used** — about 750 GB out of 2 TB — yet the
+"workload RSS" alone claimed 350 GB. Those two numbers cannot both be
+true at the same time. Something was lying.
+
+Two things were lying, actually.
+
+## Lie #1: summing RSS double-counts shared memory
+
+`memory_info().rss` (Resident Set Size) is "the total physical memory
+the kernel currently has committed for this process." It is a per-process
+quantity that includes:
+
+- The Python interpreter itself
+- Read-only library code (`libpython`, `libtorch_cpu`, `libcuda`)
+- Memory-mapped model weights
+- Copy-on-write pages inherited from `fork()`
+- The process's private heap
+
+Crucially, **the shared portions are mapped into every process's address
+space but only consume physical memory once**. When you sum RSS across N
+processes that all imported the same `transformers` and `torch`, you
+count those framework pages N times.
+
+For nv-ingest, this is severe. A typical worker process has roughly:
+
+- ~1 GB of Python + framework imports (almost entirely shared)
+- A few GB of model weights (often `mmap`-ed and shared between actor
+  replicas of the same class)
+- Hundreds of MB of decoded batch buffers (genuinely private)
+
+With ~150 worker processes on the box (Ray reuses a long-lived pool of
+`ray::IDLE` slots), the shared portion alone gets counted 150 times.
+Sum-of-RSS thus inflates the apparent footprint by 2–4×.
+
+### The fix: use **PSS** instead
+
+PSS (Proportional Set Size) is RSS-like, but each shared page is
+attributed as `1/N` of its size to each of the N processes mapping it.
+**Sum-of-PSS across all processes ≈ actual physical memory used.**
+
+`psutil.Process().memory_full_info().pss` exposes this on Linux. The
+sampler now reads PSS; RSS is only kept as a fallback for platforms
+without `/proc/<pid>/smaps`.
+
+The cost: `memory_full_info()` parses `/proc/<pid>/smaps`, which is
+~10× slower than `memory_info()`. We accommodate this by raising the
+sampler interval from 0.5 s to 1.0 s. On a 156-worker box the coarser
+sampling is imperceptible relative to the run length, and it is a small
+price to pay for eliminating the double-counting.
+
+After this change, the same ingestion run's "peak ray-workers PSS (sum)"
+dropped from 290 GB to roughly 80–150 GB. That is the *true* physical
+memory the Ray workers were holding at peak — every shared page
+counted exactly once.
+
+## Lie #2: idle workers hold memory from *prior* runs
+
+PSS fixes the double-counting, but a different problem remains: many of
+the `ray::*` processes on the host did not start because of *this* run.
+The cluster on the dev box is long-lived. After a run finishes, Ray
+keeps worker processes alive in a pool and returns them to the
+`ray::IDLE` state. Crucially:
+
+- Python's garbage collector frees objects lazily.
+- Even when freed, `glibc malloc` rarely returns pages to the OS — it
+  caches them in arena pools.
+- The CUDA context, if ever initialised, stays resident.
+
+So a worker whose `ru_maxrss` reached 6 GB during yesterday's OCR run
+still holds ~5–6 GB of RSS today, sitting idle as a `ray::IDLE` process.
+
+Counting these in "this run's memory" is wrong. They were already
+resident *before* ingestion started. The PSS sum tells you "how much
+RAM is Ray currently holding on the host", which is honest but is not
+the question users were trying to answer.
+
+### The fix: **baseline subtraction**
+
+When the sampler starts, it captures `psutil.virtual_memory().used` —
+psutil's host-wide used-memory figure (called MemUsed below), which
+psutil reports in bytes and the sampler converts to MB — and stores it
+as the **baseline**. Each subsequent sample also records the current
+MemUsed. At report time we compute:
+
+```
+delta_sys_used_mb = peak(sys_used_mb during run) - baseline_sys_used_mb
+```
+
+This is **the most defensible "what did this run cost" figure**:
+
+- It excludes everything that was already resident (idle workers,
+  kernel cache, OS, sshd, anyone else's processes).
+- It does not depend on us correctly attributing per-process memory —
+  it asks the kernel, which has the authoritative answer.
+- It captures the run's net effect on the host even if Ray spawned new
+  workers or recycled old ones.
+
+The trade-off: it includes anything *else* that grew during the run
+window (a coincidental compile, file cache growth, etc.). On a busy
+shared box this can add noise. On a dedicated ingest run it is the
+cleanest number available.
+
+## How to read the report
+
+The run-level memory section now looks like this:
+
+```
+Run-level memory (driver-sampled, PSS-based):
+  peak workload PSS (driver + ray workers) :   12303.9 MB
+  peak driver PSS                          :     158.7 MB
+  peak ray-workers PSS (sum)               :   12145.5 MB
+  baseline host MemUsed (pre-run)          :  632550.9 MB
+  peak host MemUsed (during run)           :  633113.4 MB
+  delta MemUsed (memory the run added)     :    +562.5 MB
+  host worst-case available                : 1531166.9 MB
+  host worst-case used                     :      29.3 %
+  mean ray-worker count seen               :     258.0
+  sample count                             :       2
+```
+
+What each number actually answers:
+
+| Line | Question it answers | When to trust it |
+|---|---|---|
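+| **`delta MemUsed`** | "How much did this run grow the host's used memory?" | **By default.** This is the answer to "how much memory did this run use"; on a shared host treat it as a soft upper bound, because unrelated growth during the run window is included. |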
+| `peak workload PSS` | "At peak, how much physical RAM was held by *this Ray cluster* (driver + all workers)?" | When you want to know Ray's footprint on this host, including idle holdovers. |
+| `peak ray-workers PSS (sum)` | "Of the workload PSS, how much is in the worker pool?" | When you want to see whether the driver or the workers are dominant. |
+| `host worst-case used` (%) | "How close did the run push the box to OOM?" | When the value approaches 100% — that is when this number actually matters. |
+| `mean ray-worker count seen` | "How many `ray::*` processes were resident on average?" | Sanity check: a number much higher than your stage count means a stale idle pool, which is why `peak workload PSS` may exceed `delta MemUsed` by a lot. |
+
+If you only want one number to quote in a report or PR description: it
+is `delta MemUsed`.
+
+## Why per-batch RSS is still useful
+
+The per-stage memory table in the report uses per-batch RSS (not PSS)
+captured *inside* each worker's `AbstractOperator.run()`. This is
+intentional:
+
+- It is a *per-process* measurement (one worker, one batch). There is
+  no double-counting problem to solve here — only one process is being
+  measured.
+- It is essentially free (one syscall per batch boundary).
+- It gives you `rss_peak_mb` (the worker's `ru_maxrss` high-water mark)
+  which reveals which stage made the worker grow.
+
+The driver sampler and the per-batch RSS measure different things and
+both belong in the report.
+
+## Caveats and known limitations
+
+1. **`ru_maxrss` is process-lifetime, not batch-scoped.** It only goes
+   up. We use the diff between consecutive `ru_maxrss` readings to
+   attribute peak-growth to a stage, which is correct in aggregate but
+   not perfect for any single batch.
+
+2. **PSS requires `/proc/<pid>/smaps`.** Available on Linux; the
+   sampler falls back to RSS on platforms without it. If you ever run
+   the timing on macOS or in a stripped-down container the numbers
+   will revert to being RSS-based and the double-counting returns.
+
+3. **Baseline is sampled once, before the sampler thread starts.** If
+   another process on the host allocates a lot of memory simultaneously
+   with your run, `delta MemUsed` will overstate the run's true cost.
+   On a dedicated ingest box this is fine; on a shared dev box, treat
+   `delta MemUsed` as a soft upper bound.
+
+4. **Idle workers carrying leaked state from previous runs.** PSS
+   correctly de-duplicates their shared pages, but their private heap
+   (cached models, malloc arena) still appears in `peak workload PSS`.
+   If you only want "what this run added", read `delta MemUsed`.
+
+## Quick reference
+
+- Implementation: `nemo_retriever/utils/stage_timing.py` —
+  `_MemorySampler`, `_process_pss_bytes`, `start_memory_sampler`.
+- Configuration: `NR_STAGE_TIMING=1` to enable the whole subsystem;
+  `NR_STAGE_TIMING_REPORT_PATH=<dir-or-file-stem>` to write JSON.
+- Visualization: `python -m nemo_retriever.utils.stage_timing_viz
+  --input <timing.json> --output-dir <dir>` produces
+  `memory_timeline.png` and `memory_overview.png` alongside the
+  timing charts.
diff --git a/nemo_retriever/src/nemo_retriever/utils/stage_timing_viz.py b/nemo_retriever/src/nemo_retriever/utils/stage_timing_viz.py
new file mode 100644
index 0000000000..ee1ae191ab
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/utils/stage_timing_viz.py
@@ -0,0 +1,575 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Visualise a stage-timing JSON dump produced by ``stage_timing.write_report``.
+
+Usage
+-----
+    python -m nemo_retriever.utils.stage_timing_viz \
+        --input /tmp/timing.json --output-dir /tmp/timing_charts
+
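+Produces:
+    overview.png           - per-stage totals, ms/row, throughput, phase-breakdown
+    timeline.png           - per-batch Gantt across all stages
+    memory_overview.png    - per-stage RSS panels + run-level summary (only when per-batch memory data exists)
+    memory_timeline.png    - driver-sampled PSS / host MemUsed timeline (only when memory samples exist)
+    per_stage/<stage>.png  - latency distribution, time-series, batch-size scatter,
+                             phase breakdown for each stage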
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, List
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt  # noqa: E402
+import numpy as np  # noqa: E402
+import pandas as pd  # noqa: E402
+
+_PHASE_COLS = ("preprocess_ms", "process_ms", "postprocess_ms")
+_PHASE_COLORS = {"preprocess_ms": "#7ec8e3", "process_ms": "#ff7f50", "postprocess_ms": "#9ad19a"}
+
+
+def _safe_name(name: str) -> str:
+    """Return *name* with every character that is not alphanumeric, ``-``, ``_`` or ``.`` replaced by ``_``."""
+    keep_as_is = "-_."
+    sanitized = [ch if (ch.isalnum() or ch in keep_as_is) else "_" for ch in name]
+    return "".join(sanitized)
+
+
+def load_payload(path: str) -> dict:
+    """Open the timing JSON at *path* and return its decoded top-level object."""
+    with open(path) as fh:
+        payload = json.load(fh)
+    return payload
+
+
+def load_records(path: str | Path | dict | list) -> pd.DataFrame:
+    """Build the per-batch DataFrame that every chart is drawn from.
+
+    *path* is either a filesystem path (``str`` or ``Path``) to a timing JSON,
+    or an already-decoded payload: a dict carrying a ``"records"`` list, or the
+    bare list of record dicts.  (The parameter keeps its historical name for
+    backward compatibility.)
+
+    The returned frame has one row per record, sorted by ``stage`` and start
+    time, with these derived columns added: ``t_start_s`` / ``t_end_s``
+    (seconds relative to the earliest batch), ``rows_per_s``, ``ms_per_row``,
+    ``rss_delta_mb`` and ``batch_idx`` (0-based position within the stage).
+    Memory columns missing from the records are filled with ``0.0``.
+
+    Raises ``SystemExit`` when no records are present.
+    """
+    from_file = isinstance(path, (str, Path))
+    payload = load_payload(path) if from_file else path
+    records = payload.get("records", payload) if isinstance(payload, dict) else payload
+    if not records:
+        # Only echo the argument when it is a path; repr() of an in-memory
+        # payload would dump the entire report into the error message.
+        source = repr(path) if from_file else "the supplied payload"
+        raise SystemExit(f"No records found in {source}")
+    df = pd.DataFrame(records)
+    # Zero the wallclock so the first batch starts at t=0 on charts.
+    df["t_start_s"] = df["wallclock_start"] - df["wallclock_start"].min()
+    df["t_end_s"] = df["t_start_s"] + df["total_ms"] / 1000.0
+    # Per-batch derived metrics (0.0 instead of inf/NaN when the denominator is zero)
+    df["rows_per_s"] = np.where(df["total_ms"] > 0, df["n_rows_in"] / (df["total_ms"] / 1000.0), 0.0)
+    df["ms_per_row"] = np.where(df["n_rows_in"] > 0, df["total_ms"] / df["n_rows_in"], 0.0)
+    # Memory deltas (zero when memory tracking was disabled)
+    for col in ("rss_before_mb", "rss_after_mb", "rss_peak_mb", "avail_before_mb", "avail_after_mb"):
+        if col not in df.columns:
+            df[col] = 0.0
+    df["rss_delta_mb"] = df["rss_after_mb"] - df["rss_before_mb"]
+    # Stable per-stage batch index for time-series plots
+    df = df.sort_values(["stage", "t_start_s"]).reset_index(drop=True)
+    df["batch_idx"] = df.groupby("stage").cumcount()
+    return df
+
+
+def load_samples(payload_or_path) -> pd.DataFrame:
+    """Build the memory-sample frame from a payload dict or from the path of a timing JSON.
+
+    The result is empty when the payload has no ``memory_samples``; otherwise it
+    holds one row per sample, ordered by ``t_rel_s`` when that column exists.
+    """
+    if isinstance(payload_or_path, str):
+        payload = load_payload(payload_or_path)
+    else:
+        payload = payload_or_path
+    raw_samples = payload.get("memory_samples")
+    if not raw_samples:
+        return pd.DataFrame()
+    frame = pd.DataFrame(raw_samples)
+    if "t_rel_s" not in frame.columns:
+        return frame
+    return frame.sort_values("t_rel_s").reset_index(drop=True)
+
+
+def _stage_aggregates(df: pd.DataFrame) -> pd.DataFrame:
+    """Collapse the per-batch frame to one row per stage, largest summed ``total_ms`` first.
+
+    Besides counts, sums and latency statistics, each row carries the stage's
+    memory extremes and two derived rates (``ms_per_row`` and ``rows_per_s``),
+    which are ``0.0`` whenever their denominator is zero.
+    """
+
+    def _p95(series):
+        return float(np.percentile(series, 95))
+
+    # Output column -> (source column, reducer); dict order fixes the column order.
+    named_aggs = {
+        "n_batches": ("total_ms", "size"),
+        "rows_in": ("n_rows_in", "sum"),
+        "total_ms": ("total_ms", "sum"),
+        "mean_pre_ms": ("preprocess_ms", "mean"),
+        "mean_proc_ms": ("process_ms", "mean"),
+        "mean_post_ms": ("postprocess_ms", "mean"),
+        "median_total_ms": ("total_ms", "median"),
+        "p95_total_ms": ("total_ms", _p95),
+        "max_total_ms": ("total_ms", "max"),
+        "peak_rss_mb": ("rss_peak_mb", "max"),
+        "max_rss_after_mb": ("rss_after_mb", "max"),
+        "mean_delta_mb": ("rss_delta_mb", "mean"),
+        "min_avail_mb": ("avail_before_mb", "min"),
+    }
+    per_stage = df.groupby("stage", sort=False).agg(**named_aggs)
+    rows = per_stage["rows_in"]
+    elapsed_ms = per_stage["total_ms"]
+    per_stage["ms_per_row"] = np.where(rows > 0, elapsed_ms / rows, 0.0)
+    per_stage["rows_per_s"] = np.where(elapsed_ms > 0, rows / (elapsed_ms / 1000.0), 0.0)
+    return per_stage.sort_values("total_ms", ascending=False)
+
+
+def _stage_has_memory(agg: pd.DataFrame) -> bool:
+    """Return ``True`` when at least one stage recorded a positive ``peak_rss_mb``."""
+    highest_peak = agg["peak_rss_mb"].max()
+    return bool(highest_peak > 0)
+
+
+def _stage_color_map(stages):
+    """Map each stage to a ``tab20`` colour by position, wrapping around when the palette runs out.
+
+    Feeding the same stage order to every chart keeps a stage's colour stable across panels.
+    """
+    palette = plt.get_cmap("tab20")
+    mapping = {}
+    for position, stage in enumerate(stages):
+        mapping[stage] = palette(position % palette.N)
+    return mapping
+
+
+def _pie(ax, labels, values, colors, *, title, value_unit, hide_below_pct=2.0):
+    """Draw a pie on *ax* that stays readable when a few slices dominate.
+
+    Slices whose share is under ``hide_below_pct`` percent get neither a wedge
+    label nor a percentage; the legend still lists every slice with its
+    absolute value in *value_unit* and its share.  When the values sum to
+    zero or less, the axes are switched off and a "no data" placeholder is
+    shown under the title instead.
+    """
+    amounts = np.asarray(values, dtype=float)
+    grand_total = amounts.sum()
+    if grand_total <= 0:
+        ax.axis("off")
+        ax.set_title(title)
+        ax.text(0.5, 0.5, "no data", ha="center", va="center", transform=ax.transAxes)
+        return
+    pct_share = 100.0 * amounts / grand_total
+
+    def _wedge_pct_text(pct):
+        # matplotlib passes each wedge's percentage; blank out the tiny ones.
+        return "" if pct < hide_below_pct else f"{pct:.1f}%"
+
+    visible_labels = [lbl if share >= hide_below_pct else "" for lbl, share in zip(labels, pct_share)]
+    wedge_colors = [colors[lbl] for lbl in labels]
+    wedges, _texts, _autotexts = ax.pie(
+        amounts,
+        labels=visible_labels,
+        colors=wedge_colors,
+        autopct=_wedge_pct_text,
+        startangle=90,
+        pctdistance=0.72,
+        textprops={"fontsize": 9},
+        wedgeprops={"edgecolor": "white", "linewidth": 0.8},
+    )
+    ax.set_title(title)
+    # The legend carries every slice (with absolute value) so small ones are not lost.
+    legend_entries = [
+        f"{lbl}: {amount:.2f} {value_unit} ({share:.1f}%)" for lbl, amount, share in zip(labels, amounts, pct_share)
+    ]
+    ax.legend(wedges, legend_entries, loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize=8, frameon=False)
+
+
+def plot_memory_timeline(sdf: pd.DataFrame, payload: dict, out_path: str) -> bool:
+    """Driver-sampled memory timeline: PSS components + host MemUsed (delta).
+
+    *sdf* is the memory-sample frame (one row per sample, with ``t_rel_s`` on
+    the x-axis); *payload* supplies ``baseline_sys_used_mb`` for the
+    "MemUsed - baseline" line; the figure is saved to *out_path*.  Columns
+    that are absent from *sdf* are simply not drawn.
+
+    Returns ``True`` if a plot was written, ``False`` if there were no samples.
+    """
+    if sdf.empty or "t_rel_s" not in sdf.columns:
+        return False
+    fig, ax = plt.subplots(figsize=(15, 6))
+    t = sdf["t_rel_s"].values
+
+    # PSS components (left axis, MB)
+    if "workload_pss_mb" in sdf.columns:
+        ax.plot(t, sdf["workload_pss_mb"], color="#2a4d8f", lw=2.2, label="workload PSS (driver + workers)")
+    if "workers_pss_mb" in sdf.columns:
+        ax.plot(t, sdf["workers_pss_mb"], color="#5b8fd6", lw=1.4, label="ray workers PSS (sum)")
+    if "driver_pss_mb" in sdf.columns:
+        ax.plot(t, sdf["driver_pss_mb"], color="#cc6633", lw=1.4, label="driver PSS")
+
+    # Host MemUsed delta (the "what did this run cost" line)
+    baseline = payload.get("baseline_sys_used_mb")
+    if "sys_used_mb" in sdf.columns and baseline:
+        # Coerce once so both the subtraction and the label formatting see a float.
+        baseline_mb = float(baseline)
+        delta = sdf["sys_used_mb"].values - baseline_mb
+        ax.plot(
+            t, delta, color="#222222", lw=2.2, linestyle="--", label=f"host MemUsed - baseline ({baseline_mb:.0f} MB)"
+        )
+
+    ax.set_xlabel("wallclock seconds (relative to sampler start)")
+    ax.set_ylabel("memory (MB)")
+    ax.set_title("Run-level memory timeline (driver-sampled, PSS-based)")
+    ax.grid(True, linestyle=":", alpha=0.4)
+    ax.set_ylim(bottom=min(0, ax.get_ylim()[0]))
+
+    legend_handles, legend_labels = ax.get_legend_handles_labels()
+
+    # Worker count on a twin axis (so memory plot stays readable)
+    if "n_workers" in sdf.columns:
+        ax2 = ax.twinx()
+        ax2.plot(t, sdf["n_workers"], color="#888888", lw=1.0, linestyle=":", label="n ray workers")
+        ax2.set_ylabel("n ray workers (idle + active)", color="#666666")
+        ax2.tick_params(axis="y", colors="#666666")
+        # The twin axis has its own artist list; merge it in so the labelled
+        # worker-count line actually shows up in the (single) legend.
+        twin_handles, twin_labels = ax2.get_legend_handles_labels()
+        legend_handles = legend_handles + twin_handles
+        legend_labels = legend_labels + twin_labels
+
+    if legend_handles:
+        ax.legend(legend_handles, legend_labels, loc="upper left", fontsize=9, framealpha=0.95)
+
+    # Annotated peak
+    if "workload_pss_mb" in sdf.columns:
+        peak_idx = int(np.argmax(sdf["workload_pss_mb"].values))
+        peak_t = float(sdf["t_rel_s"].iloc[peak_idx])
+        peak_v = float(sdf["workload_pss_mb"].iloc[peak_idx])
+        ax.annotate(
+            f"peak {peak_v:,.0f} MB",
+            xy=(peak_t, peak_v),
+            xytext=(10, 14),
+            textcoords="offset points",
+            fontsize=9,
+            arrowprops=dict(arrowstyle="->", color="#444"),
+        )
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=140)
+    plt.close(fig)
+    return True
+
+
+def plot_memory_overview(agg: pd.DataFrame, payload: dict, out_path: str) -> bool:
+    """Render the 2x2 per-stage memory figure and save it to *out_path*.
+
+    Panels: peak per-worker RSS per stage, mean per-batch RSS delta per stage,
+    each stage's share of peak RSS, and a text box of run-level figures read
+    from *payload*.
+
+    Returns ``False`` without writing anything when no stage has a positive
+    ``peak_rss_mb``; otherwise writes the PNG and returns ``True``.
+    """
+    if not _stage_has_memory(agg):
+        return False
+
+    def _label_bars(axis, rects, heights, template, y_offset):
+        # Print each bar's value next to its tip; *y_offset* maps the value to a point offset.
+        for rect, height in zip(rects, heights):
+            axis.annotate(
+                template.format(height),
+                xy=(rect.get_x() + rect.get_width() / 2, height),
+                xytext=(0, y_offset(height)),
+                textcoords="offset points",
+                ha="center",
+                fontsize=8,
+            )
+
+    def _summary_row(label, value, unit="MB", prec=1):
+        # Fixed-width "label value unit" row; a dash stands in for a missing value.
+        if value is None:
+            return f"  {label:<32s} {'-':>10s}"
+        return f"  {label:<32s} {value:>10,.{prec}f} {unit}"
+
+    stage_names = list(agg.index)
+    palette = _stage_color_map(stage_names)
+    positions = range(len(stage_names))
+    fig, axes = plt.subplots(2, 2, figsize=(17, 10))
+    fig.suptitle("Stage Timing - Memory Overview", fontsize=15, fontweight="bold")
+
+    # Panel 1: peak RSS per stage (per-worker high-water mark)
+    peak_ax = axes[0, 0]
+    peak_vals = agg["peak_rss_mb"].values
+    peak_rects = peak_ax.bar(positions, peak_vals, color=[palette[s] for s in stage_names])
+    peak_ax.set_xticks(positions)
+    peak_ax.set_xticklabels(stage_names, rotation=35, ha="right", fontsize=9)
+    peak_ax.set_title("Peak per-worker RSS by stage (ru_maxrss)")
+    peak_ax.set_ylabel("MB")
+    _label_bars(peak_ax, peak_rects, peak_vals, "{:,.0f}", lambda _v: 3)
+    tallest = max(peak_vals)
+    peak_ax.set_ylim(0, tallest * 1.15 if tallest > 0 else 1)
+
+    # Panel 2: mean RSS delta per batch (retained allocation)
+    delta_ax = axes[0, 1]
+    delta_vals = agg["mean_delta_mb"].values
+    sign_colors = ["#cc4444" if v > 0 else "#4f9d4f" for v in delta_vals]
+    delta_rects = delta_ax.bar(positions, delta_vals, color=sign_colors)
+    delta_ax.set_xticks(positions)
+    delta_ax.set_xticklabels(stage_names, rotation=35, ha="right", fontsize=9)
+    delta_ax.set_title("Mean RSS delta per batch (retained per call)")
+    delta_ax.set_ylabel("MB per batch (+ retained, - released)")
+    delta_ax.axhline(0, color="#888", lw=0.6)
+    # Negative bars get their label pushed below the bar end.
+    _label_bars(delta_ax, delta_rects, delta_vals, "{:+.2f}", lambda v: 3 if v >= 0 else -10)
+
+    # Panel 3: share of peak RSS (pie)
+    share_ax = axes[1, 0]
+    share_vals = agg["peak_rss_mb"].values.astype(float)
+    if share_vals.sum() > 0:
+        _pie(share_ax, stage_names, share_vals, palette, title="Share of peak RSS across stages", value_unit="MB")
+    else:
+        share_ax.axis("off")
+        share_ax.set_title("Share of peak RSS across stages")
+        share_ax.text(0.5, 0.5, "no data", ha="center", va="center")
+
+    # Panel 4: run-level summary box
+    summary_ax = axes[1, 1]
+    summary_ax.axis("off")
+    delta_used = payload.get("delta_sys_used_mb")
+    if delta_used is not None:
+        delta_row = _summary_row("delta MemUsed (run added)", delta_used)
+    else:
+        delta_row = _summary_row("delta MemUsed", None)
+    summary_lines = [
+        "Run-level memory summary",
+        _summary_row("peak workload PSS", payload.get("peak_workload_pss_mb")),
+        _summary_row("baseline host MemUsed (pre-run)", payload.get("baseline_sys_used_mb")),
+        _summary_row("peak host MemUsed (during run)", payload.get("peak_sys_used_mb")),
+        delta_row,
+        _summary_row("min host available", payload.get("min_sys_available_mb")),
+        "",
+        "This 'delta MemUsed' is the most defensible",
+        "'this run's cost' number.  PSS de-duplicates",
+        "shared pages so sum-of-PSS ~ true physical RAM.",
+    ]
+    summary_ax.text(
+        0.02,
+        0.98,
+        "\n".join(summary_lines),
+        ha="left",
+        va="top",
+        family="monospace",
+        fontsize=11,
+        transform=summary_ax.transAxes,
+    )
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=140)
+    plt.close(fig)
+    return True
+
+
+def plot_overview(df: pd.DataFrame, agg: pd.DataFrame, out_path: str) -> None:
+    """Save the 2x2 overview figure: three per-stage share pies plus one overall phase-split pie.
+
+    The per-stage pies show summed wall time, ms/row and rows processed (taken
+    from *agg*); the fourth pie sums each phase column of *df* over all stages.
+    """
+    stage_names = list(agg.index)
+    palette = _stage_color_map(stage_names)
+    fig, axes = plt.subplots(2, 2, figsize=(17, 11))
+    fig.suptitle("Stage Timing - Overview", fontsize=15, fontweight="bold")
+
+    # (panel, values, title, unit) for the three pies that are split by stage.
+    stage_pies = (
+        (axes[0, 0], agg["total_ms"].values / 1000.0, "Share of total wall time (worker-side sum)", "s"),
+        (axes[0, 1], agg["ms_per_row"].values, "Per-row cost share (ms/row across stages)", "ms/row"),
+        (axes[1, 0], agg["rows_in"].values.astype(float), "Share of rows processed per stage", "rows"),
+    )
+    for panel, amounts, heading, unit in stage_pies:
+        _pie(panel, stage_names, amounts, palette, title=heading, value_unit=unit)
+
+    # Phase breakdown: single pie of the summed phase times across all stages.
+    phase_names = ["preprocess", "process", "postprocess"]
+    phase_totals = np.array([float(df[col].sum()) for col in _PHASE_COLS])
+    phase_palette = dict(zip(phase_names, (_PHASE_COLORS[col] for col in _PHASE_COLS)))
+    _pie(
+        axes[1, 1],
+        phase_names,
+        phase_totals,
+        phase_palette,
+        title="Where time is spent overall (per phase, all stages)",
+        value_unit="ms",
+    )
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=140)
+    plt.close(fig)
+
+
+def plot_timeline(df: pd.DataFrame, agg: pd.DataFrame, out_path: str) -> None:
+    """Draw the per-batch Gantt chart (one horizontal bar per batch) and save it to *out_path*.
+
+    Stages are laid out top-to-bottom in the row order of *agg*.  Each bar
+    starts at the batch's ``t_start_s`` and spans ``total_ms``, clamped to a
+    minimum of 0.1 ms so zero-length batches remain visible.
+    """
+    stages = list(agg.index)
+    stage_to_y = {s: i for i, s in enumerate(stages)}
+    # Use the shared helper so a stage has the same colour here as in the other charts.
+    colors = _stage_color_map(stages)
+
+    fig, ax = plt.subplots(figsize=(15, max(3.0, 0.5 * len(stages) + 2)))
+    if not df.empty:
+        # One vectorised barh call draws the same bars as a per-row loop while
+        # avoiding iterrows() and one artist container per batch on long runs.
+        stage_col = df["stage"]
+        ax.barh(
+            y=stage_col.map(stage_to_y).to_numpy(),
+            width=np.maximum(df["total_ms"].to_numpy(dtype=float) / 1000.0, 1e-4),
+            left=df["t_start_s"].to_numpy(dtype=float),
+            height=0.7,
+            color=[colors[s] for s in stage_col],
+            edgecolor="black",
+            linewidth=0.3,
+        )
+    ax.set_yticks(list(stage_to_y.values()))
+    ax.set_yticklabels(list(stage_to_y.keys()))
+    ax.invert_yaxis()
+    ax.set_xlabel("wallclock seconds (relative to first batch)")
+    ax.set_title("Per-batch timeline (one bar per batch, colored by stage)")
+    ax.grid(True, axis="x", linestyle=":", alpha=0.4)
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=140)
+    plt.close(fig)
+
+
+def plot_stage_detail(stage: str, sdf: pd.DataFrame, out_path: str) -> None:
+    """Save the detail figure for one stage to *out_path*.
+
+    *sdf* holds the per-batch rows of this stage only (here ``sdf`` is a slice
+    of the records frame, not the memory-sample frame).  The figure always has
+    four panels: stacked per-batch phase latency, latency histogram with
+    p50/p95/max markers, batch size vs. latency with a linear fit, and the mean
+    phase split.  A third row with two RSS panels is added when
+    ``rss_peak_mb`` has a positive maximum.  A one-line stats footnote is
+    written under the panels.
+    """
+    has_mem = "rss_peak_mb" in sdf.columns and float(sdf["rss_peak_mb"].max()) > 0
+    nrows = 3 if has_mem else 2
+    fig, axes = plt.subplots(nrows, 2, figsize=(14, 4.5 * nrows))
+    fig.suptitle(f"Stage detail - {stage}", fontsize=14, fontweight="bold")
+
+    # 1. Latency time-series (per batch) with phase breakdown stacked
+    ax = axes[0, 0]
+    x = sdf["batch_idx"].values
+    pre = sdf["preprocess_ms"].values
+    proc = sdf["process_ms"].values
+    post = sdf["postprocess_ms"].values
+    # Each phase is stacked on top of the previous ones via ``bottom``.
+    ax.bar(x, pre, color=_PHASE_COLORS["preprocess_ms"], label="preprocess", width=1.0)
+    ax.bar(x, proc, bottom=pre, color=_PHASE_COLORS["process_ms"], label="process", width=1.0)
+    ax.bar(x, post, bottom=pre + proc, color=_PHASE_COLORS["postprocess_ms"], label="postprocess", width=1.0)
+    ax.set_title("Per-batch latency over experiment (stacked phases)")
+    ax.set_xlabel("batch index (chronological)")
+    ax.set_ylabel("ms")
+    ax.legend(loc="upper right", fontsize=8)
+
+    # 2. Latency distribution (histogram + percentiles)
+    ax = axes[0, 1]
+    total = sdf["total_ms"].values
+    # Bin count scales with the number of batches, kept within [5, 30].
+    ax.hist(total, bins=min(30, max(5, len(total) // 2)), color="#7a7a7a", edgecolor="white")
+    for q, color, label in [
+        (50, "#2266aa", "p50"),
+        (95, "#cc6633", "p95"),
+        (100, "#aa2222", "max"),
+    ]:
+        v = float(np.percentile(total, q)) if q < 100 else float(total.max())
+        ax.axvline(v, color=color, linestyle="--", linewidth=1.2, label=f"{label} = {v:.1f} ms")
+    ax.set_title("Distribution of total per-batch latency")
+    ax.set_xlabel("ms")
+    ax.set_ylabel("# batches")
+    ax.legend(fontsize=8)
+
+    # 3. Batch size vs latency
+    ax = axes[1, 0]
+    rows = sdf["n_rows_in"].values
+    ax.scatter(rows, total, alpha=0.7, color="#33558c", s=24)
+    # Only fit a line when there are at least 3 points and the batch sizes actually vary.
+    if len(rows) >= 3 and np.ptp(rows) > 0:
+        coef = np.polyfit(rows, total, 1)
+        xs = np.array([rows.min(), rows.max()])
+        ax.plot(
+            xs,
+            coef[0] * xs + coef[1],
+            color="#cc4444",
+            linestyle="--",
+            label=f"fit: {coef[0]:.2f} ms/row + {coef[1]:.1f} ms",
+        )
+        ax.legend(fontsize=8)
+    ax.set_title("Batch size vs. latency")
+    ax.set_xlabel("rows in batch")
+    ax.set_ylabel("total ms")
+
+    # 4. Phase share (pie of mean phase time)
+    ax = axes[1, 1]
+    means = [sdf[c].mean() for c in _PHASE_COLS]
+    labels = ["preprocess", "process", "postprocess"]
+    colors_l = [_PHASE_COLORS[c] for c in _PHASE_COLS]
+    if sum(means) > 0:
+        ax.pie(means, labels=labels, colors=colors_l, autopct="%1.1f%%", startangle=90)
+        ax.set_title("Where time is spent on average (per phase)")
+    else:
+        ax.axis("off")
+        ax.text(0.5, 0.5, "no data", ha="center", va="center")
+
+    # 5 + 6. Memory panels (only when memory data is present)
+    if has_mem:
+        x = sdf["batch_idx"].values
+
+        # 5. RSS over batches: before / after / peak overlaid
+        ax = axes[2, 0]
+        ax.plot(x, sdf["rss_before_mb"], color="#5b8fd6", lw=1.2, label="rss before")
+        ax.plot(x, sdf["rss_after_mb"], color="#2a4d8f", lw=1.5, label="rss after")
+        ax.plot(x, sdf["rss_peak_mb"], color="#cc4444", lw=1.5, linestyle="--", label="rss peak (ru_maxrss)")
+        # Shade the gap between the before and after readings of each batch.
+        ax.fill_between(x, sdf["rss_before_mb"], sdf["rss_after_mb"], color="#5b8fd6", alpha=0.15)
+        ax.set_title("Worker RSS over batches")
+        ax.set_xlabel("batch index (chronological)")
+        ax.set_ylabel("MB")
+        ax.legend(loc="upper left", fontsize=8)
+        ax.grid(True, linestyle=":", alpha=0.4)
+
+        # 6. RSS delta per batch (retained memory per call)
+        ax = axes[2, 1]
+        deltas = sdf["rss_delta_mb"].values
+        # Red for batches that grew RSS, green for batches that did not.
+        bar_colors = ["#cc4444" if v > 0 else "#4f9d4f" for v in deltas]
+        ax.bar(x, deltas, color=bar_colors, width=1.0)
+        ax.axhline(0, color="#888", lw=0.6)
+        ax.set_title(f"RSS delta per batch (mean = {float(deltas.mean()):+.2f} MB)")
+        ax.set_xlabel("batch index (chronological)")
+        ax.set_ylabel("delta MB (+ retained, - released)")
+        ax.grid(True, linestyle=":", alpha=0.4)
+
+    # Stats footnote
+    rows_in_total = int(sdf["n_rows_in"].sum())
+    total_s = float(sdf["total_ms"].sum() / 1000.0)
+    ms_row = total_s * 1000.0 / rows_in_total if rows_in_total else 0.0
+    footnote = (
+        f"{len(sdf)} batches | {rows_in_total} rows in | "
+        f"{total_s:.2f} s total | {ms_row:.3f} ms/row | "
+        f"mean batch: {sdf['total_ms'].mean():.1f} ms (p95 {np.percentile(total, 95):.1f} ms)"
+    )
+    if has_mem:
+        footnote += (
+            f" | peak rss: {float(sdf['rss_peak_mb'].max()):,.0f} MB"
+            f" | mean delta: {float(sdf['rss_delta_mb'].mean()):+.2f} MB/batch"
+        )
+    fig.text(0.5, 0.005, footnote, ha="center", fontsize=9, style="italic")
+
+    # Reserve a thin strip at the bottom of the figure for the footnote.
+    fig.tight_layout(rect=(0, 0.025, 1, 1))
+    fig.savefig(out_path, dpi=140)
+    plt.close(fig)
+
+
+def main(argv: List[str] | None = None) -> int:
+    """Command-line entry point: load a timing JSON, write every chart, report what was written.
+
+    *argv* is handed to ``argparse`` (``None`` means ``sys.argv[1:]``).  The
+    memory charts are skipped, with a message, when the JSON carries no memory
+    data.  Always returns ``0``.
+    """
+    cli = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    cli.add_argument("--input", "-i", default="/tmp/timing.json", help="Path to timing JSON.")
+    cli.add_argument(
+        "--output-dir", "-o", default="/tmp/timing_charts", help="Directory to write PNGs into (created if missing)."
+    )
+    cli.add_argument("--show-summary", action="store_true", help="Print the per-stage aggregate table to stdout.")
+    opts = cli.parse_args(argv)
+
+    payload = load_payload(opts.input)
+    batches = load_records(payload)
+    mem_samples = load_samples(payload)
+    stage_table = _stage_aggregates(batches)
+
+    charts_dir = Path(opts.output_dir)
+    stage_dir = charts_dir / "per_stage"
+    # Creating the nested directory also creates the output directory itself.
+    stage_dir.mkdir(parents=True, exist_ok=True)
+
+    overview_png = charts_dir / "overview.png"
+    timeline_png = charts_dir / "timeline.png"
+    memory_overview_png = charts_dir / "memory_overview.png"
+    memory_timeline_png = charts_dir / "memory_timeline.png"
+
+    plot_overview(batches, stage_table, str(overview_png))
+    plot_timeline(batches, stage_table, str(timeline_png))
+    has_mem_overview = plot_memory_overview(stage_table, payload, str(memory_overview_png))
+    has_mem_timeline = plot_memory_timeline(mem_samples, payload, str(memory_timeline_png))
+
+    written_per_stage: Dict[str, str] = {}
+    for stage in stage_table.index:
+        one_stage = batches[batches["stage"] == stage].reset_index(drop=True)
+        target = stage_dir / f"{_safe_name(stage)}.png"
+        plot_stage_detail(stage, one_stage, str(target))
+        written_per_stage[stage] = str(target)
+
+    if opts.show_summary:
+        print(stage_table.round(3).to_string())
+
+    print(f"Wrote overview        -> {overview_png}")
+    print(f"Wrote timeline        -> {timeline_png}")
+    print(
+        f"Wrote memory overview -> {memory_overview_png}"
+        if has_mem_overview
+        else "Skipped memory overview (no per-batch memory data in JSON)."
+    )
+    print(
+        f"Wrote memory timeline -> {memory_timeline_png}"
+        if has_mem_timeline
+        else "Skipped memory timeline (no memory_samples in JSON)."
+    )
+    for stage, png_path in written_per_stage.items():
+        print(f"Wrote per-stage       -> {png_path}    ({stage})")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())