pulseengine · avrabe · May 3, 2026
diff --git a/.github/workflows/engine-bench-renode-synth.yml b/.github/workflows/engine-bench-renode-synth.yml
@@ -306,7 +306,7 @@ jobs:
                   if line.startswith('E,'):
                       print(f"R1,{variant},{line}")
                   elif line.startswith(('drops,', 'samples,', 'build,',
-                                        'cycles_per_sec,', 'target_samples,')):
+                                        'cycles_per_sec,', 'target_samples,', 'overhead_cycles,')):
                       print(f"M,R1,{variant},{line}")
                   elif line == '=== END ===':
                       print(f"M,R1,{variant},END")
@@ -322,7 +322,7 @@ jobs:
                   if line.startswith('E,'):
                       print(f"R1,{variant},{line}")
                   elif line.startswith(('drops,', 'samples,', 'build,',
-                                        'cycles_per_sec,', 'target_samples,')):
+                                        'cycles_per_sec,', 'target_samples,', 'overhead_cycles,')):
                       print(f"M,R1,{variant},{line}")
                   elif line == '=== END ===':
                       print(f"M,R1,{variant},END")
@@ -341,7 +341,7 @@ jobs:
                   if line.startswith('E,'):
                       print(f"R1,{variant},{line}")
                   elif line.startswith(('drops,', 'samples,', 'build,',
-                                        'cycles_per_sec,', 'target_samples,')):
+                                        'cycles_per_sec,', 'target_samples,', 'overhead_cycles,')):
                       print(f"M,R1,{variant},{line}")
                   elif line == '=== END ===':
                       print(f"M,R1,{variant},END")
@@ -357,7 +357,7 @@ jobs:
                   if line.startswith('E,'):
                       print(f"R1,{variant},{line}")
                   elif line.startswith(('drops,', 'samples,', 'build,',
-                                        'cycles_per_sec,', 'target_samples,')):
+                                        'cycles_per_sec,', 'target_samples,', 'overhead_cycles,')):
                       print(f"M,R1,{variant},{line}")
                   elif line == '=== END ===':
                       print(f"M,R1,{variant},END")
@@ -377,7 +377,7 @@ jobs:
                   if line.startswith('E,'):
                       print(f"R1,{variant},{line}")
                   elif line.startswith(('drops,', 'samples,', 'build,',
-                                        'cycles_per_sec,', 'target_samples,')):
+                                        'cycles_per_sec,', 'target_samples,', 'overhead_cycles,')):
                       print(f"M,R1,{variant},{line}")
                   elif line == '=== END ===':
                       print(f"M,R1,{variant},END")

diff --git a/benches/engine_control/README.md b/benches/engine_control/README.md
@@ -46,6 +46,46 @@ This replaces the in-firmware histogram+mean approach whose mean
 divisor (reader `count`) diverged from the numerator (ISR event sum)
 when the sweep truncated early, invalidating the published deltas.
 
+## Framework overhead compensation
+
+Every `algo_cycles` and `handoff_cycles` value emitted on the wire is
+the raw measurement **minus** a constant `bench_overhead_cycles`,
+measured at boot before any per-event timing begins. The
+`measure_overhead()` routine in `src/main.c` runs
+
+```c
+start = k_cycle_get_32();
+end   = k_cycle_get_32();
+delta = end - start;
+```
+
+1000 times under `irq_lock`, sorts the deltas, and stores the
+**median** as `bench_overhead_cycles`. That value is then subtracted
+(saturating at 0) from every per-event count before it reaches the
+CSV stream, so what's reported is the work between the cycle-counter
+reads, not the cost of the cycle-counter reads themselves.
+
+The compensation is **visible**: the measured value is emitted as a
+metadata line `overhead_cycles,<value>` in the CSV header, preserved
+into the artifact bundle, and surfaced in `analyze.py`'s report header
+as "Overhead subtracted (cycles): baseline ...; gale ..." — a
+reviewer can audit the subtraction and re-add it if they want the
+raw numbers back.
+
+This matches the upstream Zephyr 4.4 `ztest_bench` framework's `ctrl`
+benchmark pattern (`subsys/testsuite/ztest/benchmark/`), which
+measures and subtracts the cost of an `empty_function` call from
+every reported result. Pre-compensation and post-compensation numbers
+are **different measurements** — do not combine them in a single
+comparison table.
+
+## Scope and non-claims
+
+See [SCOPE.md](SCOPE.md) for the explicit list of what this bench
+measures and what it does NOT measure. That file is the source of
+truth for any downstream copy (blog posts, reports). Do not embed
+scope claims in published copy without first updating SCOPE.md.
+
 ## Building
 
 ```sh

diff --git a/benches/engine_control/SCOPE.md b/benches/engine_control/SCOPE.md
@@ -0,0 +1,148 @@
+# `engine_control` bench — scope, non-claims, and source of truth
+
+This file is the **source of truth** for what the `engine_control`
+benchmark measures, what it does not measure, and what kind of
+evidence its numbers constitute. Subsequent blog posts, reports,
+internal memos, and external citations import language from here.
+**Do not** embed scope claims directly in published copy without
+first updating this file. Inconsistency between published copy and
+this file is a defect in the published copy.
+
+## What is measured
+
+Cycle counts on the named target at the named clock frequency, under
+nominal contention from the bench harness only (no peripheral
+traffic, no DMA, no inter-core activity, no production workload):
+
+- **`algo_cycles`** — ISR-side `control_step()` execution time:
+  cycle counter at ISR entry → cycle counter immediately after
+  `control_step()` returns. Pure C, identical between baseline and
+  gale builds; serves as the integrity check (medians must agree
+  within 10%).
+- **`handoff_cycles`** — ISR-side primitive cost: cycle counter
+  immediately after `control_step()` → cycle counter at end of ISR.
+  Covers `ring_buf_put` + `k_sem_give`. The measured engineering
+  delta between baseline (stock Zephyr primitives) and gale
+  (verified-Rust replacements) lives here.
+
+Both values have **framework overhead compensation** applied: a
+constant `bench_overhead_cycles` (median of 1000 empty
+`k_cycle_get_32()`-pair measurements taken at boot under `irq_lock`)
+is subtracted from every emitted value. The compensation constant is
+emitted in the CSV header (`overhead_cycles,<value>`) and surfaced in
+the analyzer's report header so any reader can audit and re-add it.
+This matches the Zephyr 4.4 `ztest_bench` `ctrl` benchmark pattern.
+
+The current measurement target is one of:
+- **Renode 1.16.0** (CI default, container-pinned), or
+- **Renode nightly** (CI cycle-model A/B control), or
+- **Real silicon** (when item 1 lands; STM32F4 Discovery via SWO/DWT
+  capture).
+
+The current Cortex-M target clock is **168 MHz** on `stm32f4_disco`,
+**100 kHz tick** on `qemu_cortex_m3` (smoke). Numbers are not
+comparable across these targets at face value because the cycle unit
+differs.
+
+## What is NOT measured
+
+This bench produces engineering measurements; it is **not**
+certification evidence and does **not** measure any of the following:
+
+- **Peripheral contention** — no SPI, I²C, UART RX, GPIO toggle, or
+  bus-master traffic during the measurement window. The ISR is
+  driven by an internal `k_timer`, not by an external sensor.
+- **DMA-driven I/O** — real flight controllers receive sensor data
+  via DMA-complete IRQs with bursty alignment characteristics
+  (cache, bus arbitration). This bench uses a synthetic timer ISR
+  with no DMA path.
+- **SMP / multi-core** — single-CPU only. The `gale_spinlock`
+  primitive ships in the codebase but its actual hazard
+  (concurrent CAS from another core) is **not exercised by this
+  bench**. SMP coverage is a separate workflow (`zephyr-smp-test`
+  on `qemu_x86_64`) with known runtime issues.
+- **WCET (Worst-Case Execution Time)** — the bench reports observed
+  cycle distributions. It does **not** prove a worst-case bound.
+  Establishing WCET requires static analysis tooling such as
+  **AbsInt aiT**, **Rapita RapiTime**, or **OTAWA** combined with
+  microarchitectural models for the specific MCU. Worst-case-observed
+  numbers, when added later under the bench-rigor work item 6, are
+  **not** WCET claims and must be labeled as `worst_observed`,
+  not `wcet`. The distinction is unambiguous and not negotiable in
+  published copy: an observation is not a proof.
+- **Power consumption** — the bench measures cycles, not energy or
+  current. For embedded deployment the relevant figure is often
+  µJ/op or mA average, neither of which this bench produces.
+- **Memory pressure** — peak heap, peak stack high-water mark, slab
+  fragmentation. Stack high-water-mark capture is planned (work
+  item 5, gated on real-silicon anchor first).
+- **Fault tolerance** — stuck-sensor inputs, dropped messages,
+  scheduler-induced timeouts, watchdog resets. The bench operates
+  under **nominal** scheduling only. Fault-injection coverage is
+  out of scope here and belongs in a v2 of the flight bench.
+- **Long-duration drift** — runs are seconds to minutes, not hours.
+  32-bit cycle-counter wrap behavior, accumulated heap fragmentation,
+  and ring-buffer head/tail drift over multi-hour operation are not
+  observable in this bench.
+
+## Status of the published delta
+
+The headline `−34.5%` handoff-cycle delta (gale vs GCC baseline) is:
+
+- **Real** — the cross-Renode A/B (1.16.0 vs nightly) shows 0.0%
+  drift on identical ELFs across simulator versions, ruling out the
+  cycle model as the source. The synth-vs-rustc-direct cross-check
+  shows synth's codegen agrees with (in fact slightly outperforms)
+  rustc-direct, ruling out a synth miscompile.
+- **Tool-bounded** — produced by the on-target `k_cycle_get_32`
+  reading inside Renode's per-block cost simulation. **Not** anchored
+  to a real silicon measurement until work item 1 lands.
+- **Workload-bounded** — measured in the engine_control ISR shape
+  (one timer ISR, one ring + sem hop). **Not** generalizable to
+  composed workloads (use `flight_control` for that, with its own
+  scope file).
+
+## What kind of evidence this is
+
+**Engineering measurement** under controlled simulation, with the
+methodology and toolchain enumerated in the build manifest. Suitable
+for:
+
+- Internal regression detection (CI-gated p99 ≤ 2× baseline asserts)
+- Engineering decisions about primitive choice
+- Public claims of the form "we measure X cycles under conditions Y"
+  with conditions Y enumerated above
+
+**Not** suitable for:
+
+- Certification submissions to DO-178C, ISO 26262, IEC 61508, or any
+  other safety standard. Certification evidence requires qualified
+  tools, independent verification, requirements traceability, and
+  WCET via static analysis — none of which this bench provides.
+- Marketing copy that elides the conditions
+- Citation as "verified-for-flight" performance
+
+Short version for first paragraphs of any blog post: *"Cycle
+measurements under Renode-simulated Cortex-M4F at 168 MHz on a
+synthetic ISR workload. Engineering measurement, not certification
+evidence; see SCOPE.md for the full enumeration of what is and isn't
+measured."*
+
+## When to update this file
+
+Whenever:
+
+- The measurement target changes (e.g., real silicon arrives — work
+  item 1).
+- The compensation regime changes (e.g., overhead compensation lands
+  — work item 2; algorithm or constants change later).
+- The non-claims list changes (e.g., SMP coverage is added; fault
+  injection is added in a v2).
+- A reviewer raises a scope question that the current text does not
+  unambiguously answer.
+
+Pre-compensation and post-compensation numbers are **different
+measurements**. When the compensation regime changes, anchor
+explicitly in published copy: *"Numbers below are
+overhead-compensated; pre-compensation reference values are at
+[link]"* — never combine them in the same comparison table.
diff --git a/benches/engine_control/analyze.py b/benches/engine_control/analyze.py
@@ -50,6 +50,11 @@ class Meta:
     build: str = "?"
     cycles_per_sec: int = 0
     target_samples: int = 0
+    # Per-run framework overhead (cycles), measured at boot via
+    # measure_overhead() in main.c and subtracted from every algo /
+    # handoff value before emit. Tracked here so the report header
+    # can surface the compensation that's been applied.
+    overhead_cycles: dict[str, int] = field(default_factory=dict)
     # Per-run drops/samples, keyed by run id ("R1", "R2", ...)
     drops: dict[str, int] = field(default_factory=dict)
     samples: dict[str, int] = field(default_factory=dict)
@@ -109,6 +114,11 @@ def parse_events(path: Path) -> tuple[list[Sample], Meta]:
                     meta.target_samples = int(parts[4])
                 except ValueError:
                     pass
+            elif tail == "overhead_cycles" and len(parts) >= 5:
+                try:
+                    meta.overhead_cycles[run] = int(parts[4])
+                except ValueError:
+                    pass
     return samples, meta
 
 
@@ -271,6 +281,19 @@ def render(base_s: list[Sample], gale_s: list[Sample],
     if hz:
         lines.append(f"- Cycle counter:   {hz:,} Hz "
                      f"(1 cycle ≈ {1e9/hz:.1f} ns)")
+    # Surface the framework-overhead compensation that's been applied
+    # on-target so a reviewer can audit the subtraction. Per audit P7
+    # / ztest_bench parity: every algo / handoff value below is the
+    # raw measurement minus this constant.
+    base_oh = base_m.overhead_cycles
+    gale_oh = gale_m.overhead_cycles
+    if base_oh or gale_oh:
+        b_str = ", ".join(f"{r}={v}" for r, v in sorted(base_oh.items())) \
+                or "n/a"
+        g_str = ", ".join(f"{r}={v}" for r, v in sorted(gale_oh.items())) \
+                or "n/a"
+        lines.append(f"- Overhead subtracted (cycles): "
+                     f"baseline {b_str}; gale {g_str}")
     lines.append("")
 
     # Per-step tables

diff --git a/benches/engine_control/src/main.c b/benches/engine_control/src/main.c
@@ -36,6 +36,7 @@
 #include <zephyr/sys/ring_buffer.h>
 #include <stdbool.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include "control.h"
@@ -204,12 +205,55 @@ static void crank_isr(struct k_timer *t)
 
 static uint32_t count = 0;
 
+/*
+ * Bench framework overhead — measured at boot before any per-event
+ * timing begins (see measure_overhead). Subtracted from every algo /
+ * handoff cycle count emitted to the CSV stream so the published
+ * numbers reflect work between the cycle-counter reads, not the cost
+ * of the cycle-counter reads themselves. The measured value is also
+ * emitted as a metadata line so reviewers can audit the compensation
+ * step. Same idiom as Zephyr 4.4 ztest_bench's `ctrl` benchmark.
+ */
+#define OVERHEAD_SAMPLES 1000U
+static uint32_t bench_overhead_cycles = 0U;
+
+static int cmp_u32(const void *a, const void *b)  /* qsort comparator: ascending uint32_t order */
+{
+	uint32_t x = *(const uint32_t *)a;
+	uint32_t y = *(const uint32_t *)b;
+	return (x > y) - (x < y);  /* yields -1, 0 or 1; a plain x - y could wrap for unsigned inputs */
+}
+
+static void measure_overhead(void)  /* stores the probe median in bench_overhead_cycles */
+{
+	static uint32_t samples[OVERHEAD_SAMPLES];  /* static storage rather than a stack array */
+	unsigned int key = irq_lock();  /* keep interrupts out of the back-to-back reads */
+	for (uint32_t i = 0; i < OVERHEAD_SAMPLES; i++) {
+		uint32_t a = k_cycle_get_32();
+		uint32_t b = k_cycle_get_32();
+		samples[i] = b - a;  /* unsigned difference of two consecutive counter reads */
+	}
+	irq_unlock(key);  /* restore the previous interrupt state before sorting */
+	qsort(samples, OVERHEAD_SAMPLES, sizeof(uint32_t), cmp_u32);
+	bench_overhead_cycles = samples[OVERHEAD_SAMPLES / 2];  /* median (upper-middle element, since the sample count is even) */
+}
+
+/* Saturating subtraction — never report a negative cycle count. With
+ * a quiet measurement window plus interrupt-locked overhead probe, the
+ * compensated value should rarely if ever underflow, but we clip to 0
+ * defensively rather than silently wrapping. */
+static inline uint32_t compensate(uint32_t raw)
+{
+	return raw > bench_overhead_cycles ? raw - bench_overhead_cycles : 0U;  /* raw <= overhead reports 0 */
+}
+
 static void emit_event(const struct crank_sample *s)
 {
 	uint32_t handoff = g_handoff_by_slot[s->seq % RING_CAPACITY_SAMPLES];
 	printf("E,%u,%u,%u,%u,%u\n",
 	       (unsigned)s->seq, (unsigned)s->step, (unsigned)s->rpm,
-	       (unsigned)s->algo_cycles, (unsigned)handoff);
+	       (unsigned)compensate(s->algo_cycles),
+	       (unsigned)compensate(handoff));
 }
 
 static void reader_loop(void)
@@ -239,7 +283,12 @@ static void print_csv_header(void)
 	);
 	printf("cycles_per_sec,%u\n", hz);
 	printf("target_samples,%u\n", TOTAL_SAMPLES);
+	/* Visible compensation: every algo / handoff value below has had
+	 * this many cycles subtracted. Median of 1000 empty cycle-counter
+	 * read pairs measured at boot under irq_lock (see measure_overhead). */
+	printf("overhead_cycles,%u\n", bench_overhead_cycles);
 	printf("# event rows: E,<seq>,<step>,<rpm>,<algo_cycles>,<handoff_cycles>\n");
+	printf("# algo / handoff cycles are AFTER subtracting overhead_cycles\n");
 }
 
 static void print_csv_footer(void)
@@ -343,6 +392,12 @@ int main(void)
 
 	k_thread_priority_set(k_current_get(), 5);
 
+	/* Measure framework overhead BEFORE the CSV header so the value
+	 * is recorded in the header line. Runs at thread context with
+	 * IRQs locked for the inner loop only — no other threads exist
+	 * yet, so this is the quietest the system will ever be. */
+	measure_overhead();
+
 	/* Emit CSV header BEFORE starting the sweep so stdout ordering
 	 * is deterministic: header, then events interleaved with sweep
 	 * progress printk, then footer. */

diff --git a/benches/engine_control/tag_events.py b/benches/engine_control/tag_events.py
@@ -33,6 +33,7 @@ def main(argv: list[str]) -> int:
                     print(f"M,R{run_id},{variant},END")
                 elif (line.startswith("cycles_per_sec,")
                       or line.startswith("target_samples,")
+                      or line.startswith("overhead_cycles,")
                       or line.startswith("build,")):
                     print(f"M,R{run_id},{variant},{line}")
                 elif line.startswith("#"):