diff --git a/docs/dfx/dep_gen.md b/docs/dfx/dep_gen.md index 867472b7c..c1e83f8d4 100644 --- a/docs/dfx/dep_gen.md +++ b/docs/dfx/dep_gen.md @@ -1,4 +1,4 @@ -# dep_gen — Complete Per-Submit Dependency Graph (v2, Tensor-Annotated) +# dep_gen — Complete Per-Submit Dependency Graph (Tensor-Annotated) ## 1. Background & Motivation @@ -61,8 +61,8 @@ appear too. This is the guarantee against silent shotgun modifications — anyone who changes `compute_task_fanin` semantics will trip the gate immediately and know to update the annotated mirror. -- **Output.** `/deps.json` — v2 schema with `tasks[]`, - `tensors[]`, and tensor-annotated `edges[]` (see §4). +- **Output.** `/deps.json` — strided-Tensor schema with + `tasks[]`, `tensors[]`, and tensor-annotated `edges[]` (see §4). --- @@ -97,29 +97,35 @@ The standard SceneTest path --- -## 4. Output: `deps.json` (v2) +## 4. Output: `deps.json` ```json { - "version": 2, "tasks": [ - {"task_id": "0", "scope": "auto"}, - {"task_id": "4294967296", "scope": "auto"} + {"task_id": "0", "scope": "auto", "args": []}, + {"task_id": "4294967296", "scope": "auto", "args": [ + {"idx": 0, "type": "INPUT", "tensor_id": "13451765318376212391", + "dtype": "FLOAT32", "shape": [16384], + "start_offset": "0", "strides": [1]} + ]} ], "tensors": [ {"tensor_id": "13451765318376212391", "buffer_addr": "29204938752", "version": 0, - "dtype": "FLOAT32", "ndims": 1, "raw_shapes": [16384]} + "dtype": "FLOAT32", "buffer_numel": "16384"} ], "edges": [ {"pred": "0", "succ": "4294967296", "arg": 0, "source": "creator", "tensor_id": "13451765318376212391", "consumer_dtype": "FLOAT32", - "consumer_shape": [16384], "consumer_offset": [0]}, + "consumer_shape": [16384], + "consumer_start_offset": "0", "consumer_strides": [1]}, {"pred": "4294967296", "succ": "4294967298", "arg": 0, "source": "tensormap", "overlap": "covered", "tensor_id": "9514117477438350967", "consumer_dtype": "FLOAT32", - "consumer_shape": [16384], "consumer_offset": [0], - "producer_shape": 
[16384], "producer_offset": [0]} + "consumer_shape": [16384], + "consumer_start_offset": "0", "consumer_strides": [1], + "producer_shape": [16384], + "producer_start_offset": "0", "producer_strides": [1]} ] } ``` @@ -153,8 +159,9 @@ this block. One entry per unique `(buffer_addr, version)` pair touched by the trace. `tensor_id` is a stable FNV-1a 64-bit hash of that pair — identical inputs across runs yield the same id, making `deps.json` files diffable. -`raw_shapes` describes the **underlying buffer**, not the slice; -per-edge slice information lives in the `edges[]` entries. +`buffer_numel` is the element count of the **underlying buffer**, not the +slice; per-edge slice geometry (`shape` + `start_offset` + `strides`) +lives in the `edges[]` entries. ### `edges[]` @@ -168,8 +175,12 @@ Each edge is `{pred, succ}` plus annotation. Fields: | `overlap` | string | `source=tensormap` | `covered` (producer slice fully contains consumer slice) or `other` | | `tensor_id` | uint64 (string) | not `explicit` | Identity of the underlying tensor; cross-references `tensors[]` | | `consumer_dtype` | string | not `explicit` | Element type the consumer reads as | -| `consumer_shape`, `consumer_offset` | uint32 array | not `explicit` | The slice the consumer actually reads | -| `producer_shape`, `producer_offset` | uint32 array | `source=tensormap` | The slice the producer wrote (recovered from the live tensormap entry) | +| `consumer_shape` | uint32 array | not `explicit` | Per-dim element count of the consumer slice | +| `consumer_start_offset` | uint64 (string) | not `explicit` | Element offset of the consumer slice into the buffer | +| `consumer_strides` | uint32 array | not `explicit` | Per-dim stride (in elements) of the consumer slice; runtime invariant > 0 | +| `producer_shape` | uint32 array | `source=tensormap` | Per-dim element count of the producer slice | +| `producer_start_offset` | uint64 (string) | `source=tensormap` | Element offset of the producer slice | +| 
`producer_strides` | uint32 array | `source=tensormap` | Per-dim stride of the producer slice; runtime invariant > 0 | A single `(pred, succ)` pair can appear in `edges[]` multiple times if the producer drives the consumer through multiple slots, multiple @@ -222,9 +233,10 @@ Each arg row carries a 4-line block: ```text arg [ ?] : -raw: [...] # underlying buffer (from tensors[].raw_shapes) -shape: [...] # slice this slot accesses -offset: [...] # slice start in the raw buffer +storage: elems # underlying buffer size +shape: [...] # slice this slot accesses +strides: [...] # per-dim element strides +start_offset: (elem) # slice start in the underlying buffer ``` `` is `T` from `tensors[]` order, so two slots referencing @@ -270,7 +282,7 @@ for this tool. ## 6. Relationship to `fanout[]` + Validation Gate -When checking fanout coverage, project v2 edges down to a +When checking fanout coverage, project annotated edges down to a `{(pred, succ)}` set first — the per-edge annotation distinguishes sources / args / slices, so the raw `edges[]` count is a superset of the underlying task-pair count. @@ -342,7 +354,7 @@ list; only the dep_gen replay graph loses the tail. | AICPU writer | `src/a2a3/platform/{include,src}/aicpu/dep_gen_collector_aicpu.{h,cpp}` | Single-instance write path; weak-fallback exported to host build | | Host collector | `src/a2a3/platform/{include/host,src/host}/dep_gen_collector.{h,cpp}` | `ProfilerBase` — drains ring → `records_` vector | | Capture call site | `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp` `submit_task_common` | One conditional block that snapshots inputs into the ring when `is_dep_gen_enabled()`; fires for both `submit_task` and `submit_dummy_task`. Dep-only tasks land in the record stream with valid tensor/dep info but no kernel_id field (the schema does not carry kernel_id), so replay treats them as ordinary dep nodes — viewers do not currently distinguish dummy from real tasks. 
| -| Replay | `src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.{h,cpp}` | Pure CPU; runs dual-pass differential replay — `compute_task_fanin` (oracle) + inlined STEP A/B mirror (annotated) against two `PTO2TensorMap` instances. Emits v2 `deps.json` when both passes agree per record. | +| Replay | `src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.{h,cpp}` | Pure CPU; runs dual-pass differential replay — `compute_task_fanin` (oracle) + inlined STEP A/B mirror (annotated) against two `PTO2TensorMap` instances. Emits `deps.json` when both passes agree per record. | | Device-runner hookup | `src/a2a3/platform/{onboard,sim}/host/device_runner.cpp` | post-`reconcile_counters` calls `dep_gen_replay_emit_deps_json(records.data(), records.size(), deps_path, nullptr)` | | Viewer | `simpler_setup/tools/deps_to_graph.py` | `deps.json` → pan/zoom HTML | | Test | `tests/st/a2a3/tensormap_and_ringbuffer/dep_gen_capture/test_dep_gen_capture.py` | Smoke test + `fanout ⊆ deps` validation gate | diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md index 6f1dfe680..4f7cc24f3 100644 --- a/docs/dfx/l2-swimlane-profiling.md +++ b/docs/dfx/l2-swimlane-profiling.md @@ -25,14 +25,13 @@ available. ## 2. Overview -- **Per-task AICore timing** — `start_time`, `end_time`, - `duration`, plus AICPU-stamped `dispatch_time` / `finish_time`. +- **Per-task AICore timing** — `start_time_us`, `end_time_us`, + `duration_us`, plus AICPU-stamped `dispatch_time_us` / `finish_time_us`. - **Per-task fanout chain** — successor `task_id`s recorded in the L2 record so dependency arrows show up in the Perfetto view. - **AICPU scheduler phases** — per-iteration breakdown into - `SCHED_COMPLETE` / `SCHED_DISPATCH` / `SCHED_SCAN` / - `SCHED_IDLE_WAIT`. + `complete` / `dispatch` / `scan` / `idle`. 
- **Orchestrator phase summary** — cumulative cycle counts for the orchestrator's nine sub-steps (sync / alloc / params / lookup / heap / insert / fanin / finalize / scope_end). @@ -57,10 +56,10 @@ backward-compatible with the old boolean behavior). | Level | Collects | Notes | | ----- | -------- | ----- | | 0 | Nothing (disabled) | Default when flag is absent | -| 1 | AICore timing only (start/end/task_id/func_id/core_type) | No AICPU timestamps, no fanout | -| 2 | + dispatch_time, finish_time, fanout | Full per-task record | -| 3 | + Scheduler phases (`SCHED_*`) | Skips orchestrator phases | -| 4 | + Orchestrator phases | Full collection | +| 1 | AICore timing only (start_time_us/end_time_us/task_id/func_id/core_type) | No AICPU timestamps, no fanout | +| 2 | + dispatch_time_us, finish_time_us, fanout | Full per-task record | +| 3 | + scheduler phases (`aicpu_scheduler_phases[]`) | Skips orchestrator phases | +| 4 | + orchestrator phases (`aicpu_orchestrator_phases[]`) | Full collection | ```bash # Standalone runner @@ -88,8 +87,8 @@ dispatch/finish timestamps and fanout are recorded only at level >= 2, scheduler phase records only at level >= 3, and orchestrator phase records only at level >= 4. -The JSON output `"version"` field directly reflects the -perf_level: `1` = AICore timing only, `2` = +dispatch/fanout, +The JSON output `"l2_perf_level"` field is the captured perf_level: +`1` = AICore timing only, `2` = +dispatch/fanout, `3` = +scheduler phases, `4` = +orchestrator phases. `--rounds > 1` collects only on the **first** round so warm-up @@ -118,22 +117,29 @@ you pass to `swimlane_converter`. 
Important fields per task: | Field | Meaning | | ----- | ------- | -| `task_id` | Runtime task id, hex (low 32 bits = AICore register token; full 64 bits filled by AICPU) | +| `task_id` | Runtime task id (`(ring_id << 32) \| local_id`); the ring part is also exposed separately as `ring_id` | | `func_id` | Kernel function id | -| `core_type` | `0` = AIC, `1` = AIV | -| `start_time` / `end_time` / `duration` | AICore device-clock cycles (`get_sys_cnt`) | -| `dispatch_time` | AICPU timestamp when this task was dispatched | -| `finish_time` | AICPU timestamp when AICPU observed FIN | -| `fanout[]` / `fanout_count` | Successor task ids, used by Perfetto dependency arrows | +| `core_id` / `core_type` | Physical core index and `"aic"` / `"aiv"` string | +| `start_time_us` / `end_time_us` / `duration_us` | AICore execution window in microseconds | +| `dispatch_time_us` | AICPU timestamp when this task was dispatched (filled at level >= 2) | +| `finish_time_us` | AICPU timestamp when AICPU observed FIN (filled at level >= 2) | +| `fanout[]` / `fanout_count` | Successor task ids (level >= 2), used by Perfetto dependency arrows | -Phase records (per scheduler thread): +Phase records (per scheduler thread, level >= 3 for +`aicpu_scheduler_phases[]` and level >= 4 for +`aicpu_orchestrator_phases[]`): | Field | Meaning | | ----- | ------- | -| `start_time` / `end_time` | Phase start / end timestamps | -| `loop_iter` | Scheduler loop iteration number | -| `phase_id` | One of `SCHED_COMPLETE` / `SCHED_DISPATCH` / `SCHED_SCAN` / `SCHED_IDLE_WAIT`, or `ORCH_*` for orchestrator phases | +| `start_time_us` / `end_time_us` | Phase start / end timestamps in microseconds | +| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` / `scan` / `idle`. Orchestrator: `orch_*` (sync / alloc / params / lookup / heap / insert / fanin / finalize / scope_end). 
| +| `loop_iter` (scheduler) / `submit_idx` (orchestrator) | Iteration / submit-call counter for the producing thread | | `tasks_processed` (scheduler) / `task_id` (orchestrator) | Phase-specific union field | +| `pop_hit` / `pop_miss` (dispatch only) | Ready-queue pop deltas since the previous dispatch emit | + +`core_to_thread[]` (level >= 3) maps `core_id` (array index) to the +scheduler thread index that retired that core's tasks (`-1` = +unassigned). ### 3.3 Convert and view in Perfetto @@ -162,7 +168,7 @@ in. The trace contains: channel). Each task shows `func_name(t)`; dependency arrows follow `fanout[]`. - **AICPU View** — scheduler thread lanes with per-iteration - phase blocks coloured by `phase_id`. + phase blocks coloured by `phase`. - **AICPU Scheduler** — orchestrator phase summary at the top. When the run also emitted a device log (`device-*` file under @@ -206,12 +212,13 @@ schema and L3 example. What the swimlane shows: - **Per-task wall-clock placement.** Where each task ran on which - AICore, with start / end / duration in device cycles. -- **Dispatch and finish overhead.** `dispatch_time` and - `finish_time` come from AICPU, so the gap between - `dispatch_time` and `start_time` is the AICPU→AICore - hand-off latency, and the gap between `end_time` and - `finish_time` is the FIN-observation latency. + AICore, with `start_time_us` / `end_time_us` / `duration_us` in + microseconds (converted from device cycles). +- **Dispatch and finish overhead.** `dispatch_time_us` and + `finish_time_us` come from AICPU, so the gap between + `dispatch_time_us` and `start_time_us` is the AICPU→AICore + hand-off latency, and the gap between `end_time_us` and + `finish_time_us` is the FIN-observation latency. - **Dependency chains.** `fanout[]` lets Perfetto draw arrows between predecessor and successor tasks. 
- **Scheduler-loop time decomposition.** Per-iteration AICPU @@ -279,7 +286,7 @@ platform-owned AICore state, and never reassigned — so AICore is fully decoupled from any AICPU-side records-buffer rotation. AICPU, on observing FIN, validates the slot's register token, copies the slot record into the current `L2PerfBuffer::records[count]`, fills -`func_id` / `core_type` / `dispatch_time` / `finish_time` / `fanout`, +`func_id` / `core_type` / `dispatch_time_us` / `finish_time_us` / `fanout`, advances `count`, and rotates the records buffer in place when it fills up. The ring is sized to the runtime's in-flight issue depth (2 for dual-issue today; raise to the next power of two when issue @@ -619,7 +626,7 @@ data (only `tensormap_and_ringbuffer` does, and only when `AicpuPhaseHeader` was not initialized. Verify the runtime sets the magic in its scheduler init path. -**`dispatch_time` < `finish_time` mismatch.** Verify the runtime +**`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime overwrites `task_id` with the full encoding on FIN (`tensormap_and_ringbuffer` does `(ring_id << 32) | local_id`); a half-filled record means AICore diff --git a/simpler_setup/tools/README.md b/simpler_setup/tools/README.md index 49817aa53..a1a548440 100644 --- a/simpler_setup/tools/README.md +++ b/simpler_setup/tools/README.md @@ -133,7 +133,7 @@ Analyze AICPU scheduler overhead and quantitatively decompose the sources of Tai `sched_overhead_analysis` reads two artifacts produced by the runtime: -1. **Perf profiling data** (`l2_perf_records_*.json`, v2): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas. +1. 
**Perf profiling data** (`l2_perf_records_*.json`, l2_perf_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas. 2. **`deps.json`** (optional, dep_gen replay output): structural task DAG. When colocated with the perf JSON, Part 2 prints per-thread fanout / fanin aggregates derived from it. ### Basic Usage @@ -167,7 +167,7 @@ Output is emitted in three parts: - **Part 2: AICPU scheduler loop breakdown** — per-scheduler-thread loop statistics, per-phase (scan / complete / dispatch / idle) time ratios, pop_hit / pop_miss totals, and (when deps.json is available) per-thread fanout / fanin aggregates - **Part 3: Tail OH distribution & cause analysis** — Tail OH quantile distribution (P10–P99), correlation between scheduler loop iteration time and Tail OH, and data-driven insights into the dominant phase -The perf JSON must have non-empty `aicpu_scheduler_phases` (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing). +The perf JSON must be captured at l2_perf_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing). 
--- @@ -283,15 +283,35 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi ```json { + "l2_perf_level": 4, "tasks": [ { "task_id": 0, "func_id": 0, - "core_id": 0, - "core_type": "aic", - "start_time_us": 100.0, - "end_time_us": 250.5, - "duration_us": 150.5 + "core_id": 7, + "core_type": "aiv", + "ring_id": 0, + "start_time_us": 47.46, + "end_time_us": 55.9, + "duration_us": 8.44, + "dispatch_time_us": 45.94, + "finish_time_us": 60.52, + "fanout": [4294967299, 4294967297, 4294967296], + "fanout_count": 3 + }, + { + "task_id": 4294967296, + "func_id": 1, + "core_id": 7, + "core_type": "aiv", + "ring_id": 1, + "start_time_us": 68.68, + "end_time_us": 70.42, + "duration_us": 1.74, + "dispatch_time_us": 68.24, + "finish_time_us": 71.2, + "fanout": [4294967298], + "fanout_count": 1 } ] } @@ -300,6 +320,15 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi Dependency edges come from `deps.json` (dep_gen replay) at post-process time — not from the perf JSON. See [`swimlane_converter --deps-json`](#swimlane_converter). +Top-level layout depends on `l2_perf_level`: + +- All levels: `l2_perf_level`, `tasks[]` (per-task fields above). +- `>= 3`: also `aicpu_scheduler_phases[]` (per-thread phase records: + scan / complete / dispatch / idle) and `core_to_thread[]` (core_id → + scheduler thread index). +- `>= 4`: also `aicpu_orchestrator_phases[]` (per-task orchestrator + phase records). 
+ ### Kernel Config Format To display meaningful function names in the output, provide a `kernel_config.py` file: @@ -379,10 +408,11 @@ For batch-run hardware regression, see the dev-only script - Check the kernel_config.py file format - Make sure every KERNELS entry has a 'func_id' and 'name' field -### Error: Unsupported version +### Error: Unsupported l2_perf_level -- The tools only support version 1 of the profiling data format -- Regenerate the profiling data with the latest runtime +- The tools accept l2_perf_level 1–4 (the integer captured at runtime + via `--enable-l2-swimlane <level>`) +- Regenerate the profiling data with a supported level ### Error: Perf JSON missing required fields for scheduler overhead analysis @@ -407,7 +437,7 @@ For batch-run hardware regression, see the dev-only script | ---- | ---- | ------- | ------ | | `l2_perf_records_*.json` | Runtime | Raw timing profiling data | JSON | | `merged_swimlane_*.json` | swimlane_converter | Perfetto visualization | Chrome Trace Event JSON | -| `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON (v2) | +| `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON | | `deps_graph.html` | deps_to_graph | Pan/zoom dependency graph viewer | HTML (self-contained) | --- diff --git a/simpler_setup/tools/deps_to_graph.py b/simpler_setup/tools/deps_to_graph.py index a5f1f917f..ec185e6ce 100644 --- a/simpler_setup/tools/deps_to_graph.py +++ b/simpler_setup/tools/deps_to_graph.py @@ -46,7 +46,7 @@ def _normalize_task_id(v): """Unsigned 64-bit task id (matches deps.json edges and l2_perf task_id). 
- Accepts ints (legacy) and strings (current schema): deps.json v2 emits all + Accepts ints (legacy) and strings (current schema): deps.json emits all uint64 fields as quoted strings to dodge JSON-number precision loss in JavaScript-based consumers, since tensor_ids (FNV hashes) and buffer addresses routinely exceed Number.MAX_SAFE_INTEGER (2^53 - 1).""" @@ -107,31 +107,26 @@ def fmt(task_id): def _load_deps_edges(deps_path): - """Parse deps.json (v2) into renderer-friendly pieces. + """Parse deps.json into renderer-friendly pieces. Returns a 5-tuple: edges: sorted list of unique (pred, succ) pairs — what the graph - renders as arrows. v2 may have multiple annotated edges sharing - the same (pred, succ) (distinct arg / source / slice); they - collapse to one arrow here. + renders as arrows. Multiple annotated edges sharing the same + (pred, succ) (distinct arg / source / slice) collapse to one + arrow here. nodes: sorted list of all referenced task ids. annotations: dict[(pred, succ) -> list[dict]] of annotation rows - (one per annotated edge in v2), keyed in insertion order so + (one per annotated edge), keyed in insertion order so ``--show-tensor-info`` can resolve per-edge tensor identities and target the right input port on the consumer node. - tensor_table: dict[tensor_id -> dict] from the v2 tensors[] block. - task_table: dict[task_id -> dict] from the v2 tasks[] block, + tensor_table: dict[tensor_id -> dict] from the tensors[] block. + task_table: dict[task_id -> dict] from the tasks[] block, carrying the per-arg input/output slot info that the ``--show-tensor-info`` view renders as compartments inside each task node. - - Raises ValueError if ``version`` is not 3 — older versions are no longer supported. 
""" with open(deps_path) as f: data = json.load(f) - version = data.get("version") - if version != 3: - raise ValueError(f"deps.json version={version!r}; only v3 is supported (regenerate with current dep_gen)") edges_raw = data.get("edges", []) seen: set[tuple[int, int]] = set() edges: list[tuple[int, int]] = [] @@ -400,9 +395,8 @@ def _arg_row_html(arg, tensor_table, side): tname = _short_tensor_label(tid, tensor_table) dtype = arg.get("dtype") shape = arg.get("shape") - # v3 schema: per-slot strided descriptor (start_offset is uint64 quoted as - # string, stride is per-dim int32 array). Older v2 args used a multi-dim - # `offset` array — viewers consuming older logs should bump producer first. + # Strided-Tensor per-slot descriptor: start_offset is uint64 quoted as + # string, strides is a per-dim uint32 array. start_offset = arg.get("start_offset") strides = arg.get("strides") buffer_numel = None @@ -573,7 +567,7 @@ def emit_dot(edges, nodes, meta, direction="LR", annotations=None, tensor_table= # port; pick one representative annotation per (pred, succ) for the # producer-port match (multiple annotations sharing the pair all # target the same arg in practice — distinct args produce distinct - # edges in v2). + # edges). rows = annotations.get((pred, succ), []) if not rows: lines.append(f" {_node_id(pred)} -> {_node_id(succ)};") diff --git a/simpler_setup/tools/sched_overhead_analysis.py b/simpler_setup/tools/sched_overhead_analysis.py index 5ab4e7b36..1ac737070 100644 --- a/simpler_setup/tools/sched_overhead_analysis.py +++ b/simpler_setup/tools/sched_overhead_analysis.py @@ -11,7 +11,8 @@ Inputs: 1. Per-task perf profiling data (l2_perf_records_*.json) with - ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane``. + ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane`` at + level >= 3. 2. deps.json (optional, dep_gen replay output) colocated with the perf JSON, used to derive per-thread fanout / fanin DAG stats. 
@@ -29,8 +30,9 @@ def _to_uint64(v): - """Coerce JSON-encoded uint64 (int or string after the deps.json v2 schema - bump in #769) to a Python int. Returns None when unparseable.""" + """Coerce a JSON-encoded uint64 (int, or string — deps.json quotes uint64s + so JavaScript-based consumers don't lose precision past 2^53 - 1) to a + Python int. Returns None when unparseable.""" try: n = int(v) except (TypeError, ValueError): @@ -292,7 +294,7 @@ def run_analysis( # noqa: PLR0912, PLR0915 print_sources: Whether to print selected input files. perf_data: Optional pre-parsed perf JSON dict. When provided, skip re-reading from disk — main() already parses the file to probe - for v2 phase data, so passing the result through saves a second + for phase data, so passing the result through saves a second load on large artifacts. deps_json_path: Optional deps.json (dep_gen replay output) co-located with the perf JSON. When present, per-thread fanout / fanin @@ -485,7 +487,7 @@ def run_analysis( # noqa: PLR0912, PLR0915 else: pop_hit = pop_miss = 0 pop_hit_rate = 0.0 - print(" Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane)") + print(" Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane at level >= 3)") print() print("=" * 90) diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py index 5841617cb..ebe4c6820 100644 --- a/simpler_setup/tools/swimlane_converter.py +++ b/simpler_setup/tools/swimlane_converter.py @@ -92,7 +92,8 @@ def read_perf_data(filepath): filepath: Path to input JSON file Returns: - dict: Parsed performance data with key: + dict: Parsed performance data with keys: + - l2_perf_level - tasks (list) Raises: @@ -101,8 +102,13 @@ def read_perf_data(filepath): with open(filepath) as f: data = json.load(f) - if "tasks" not in data: - raise ValueError("Missing required field: tasks") + required_fields = ["l2_perf_level", "tasks"] + for field in required_fields: + if field not in data: + raise 
ValueError(f"Missing required field: {field}") + + if data["l2_perf_level"] not in [1, 2, 3, 4]: + raise ValueError(f"Unsupported l2_perf_level: {data['l2_perf_level']} (expected 1, 2, 3, or 4)") return data @@ -135,13 +141,6 @@ def load_deps_json(deps_path): if not isinstance(edges, list): print(f"Warning: {deps_path} has no 'edges' array", file=sys.stderr) return None - version = data.get("version") - if version != 2: - print( - f"Warning: {deps_path} version={version!r}; only v2 is supported.", - file=sys.stderr, - ) - return None # The converter only needs flow-event endpoints (not the per-edge tensor # annotations). Project annotated edges down to a (pred, succ) set and # dedup so multiple annotated edges sharing the same pair (distinct arg @@ -394,15 +393,15 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 output_path: Path to output JSON file func_id_to_name: Optional dict mapping func_id to function name verbose: Print progress information - scheduler_phases: Optional list of per-thread phase record lists (version 2) - orchestrator_phases: Optional list of per-task orchestrator phase records (version 2) + scheduler_phases: Optional list of per-thread phase record lists (l2_perf_level >= 3) + orchestrator_phases: Optional list of per-task orchestrator phase records (l2_perf_level >= 4) core_to_thread: Optional list mapping core_id (index) to scheduler thread index (-1 = unassigned) Generates processes in the trace: - pid=1 "AICore View": start_time_us to end_time_us (kernel execution) - pid=2 "AICPU View": dispatch_time_us to finish_time_us (AICPU perspective) - - pid=3 "AICPU Scheduler": scheduler phase bars (version 2) - - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (version 2) + - pid=3 "AICPU Scheduler": scheduler phase bars (l2_perf_level >= 3) + - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_perf_level >= 4) """ if verbose: print("Generating Chrome Trace JSON...") @@ -699,7 +698,7 @@ def 
generate_chrome_trace_json( # noqa: PLR0912, PLR0915 if hb_violation_count > 0: print(f" Happens-before violations: {hb_violation_count} edge(s) flagged as 'hb_violation'") - # AICPU Scheduler phase events (version 2) + # AICPU Scheduler phase events (l2_perf_level >= 3) if scheduler_phases: # Process metadata events.append( @@ -756,7 +755,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 } events.append(event) - # AICPU Orchestrator lane (version 2) + # AICPU Orchestrator lane (l2_perf_level >= 4) # # Per-event AicpuPhaseRecord[] is the single source of truth for # orchestrator timing. There is no separate aggregate summary — the @@ -1147,10 +1146,12 @@ def _resolve_output_path(args, input_path): def _print_verbose_data_info(data, verbose): - """Print verbose summary of loaded performance data including phase counts.""" + """Print verbose summary of loaded performance data, including phase counts + when present (l2_perf_level >= 3).""" if not verbose: return print("\n=== Performance Data ===") + print(f" L2 perf level: {data['l2_perf_level']}") print(f" Task Count: {len(data['tasks'])}") if data["tasks"]: start_times = [t["start_time_us"] for t in data["tasks"]] diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_perf_collector.cpp index 745dab8b6..715ee66d6 100644 --- a/src/a2a3/platform/src/host/l2_perf_collector.cpp +++ b/src/a2a3/platform/src/host/l2_perf_collector.cpp @@ -595,7 +595,9 @@ int L2PerfCollector::export_swimlane_json() { // Fanout fields are emitted as empty/zero — the device-side hot path no // longer carries them. Downstream (swimlane_converter.py) joins fanout // from the sibling deps.json (dep_gen output). 
+ int l2_perf_level = static_cast(l2_perf_level_); outfile << "{\n"; + outfile << " \"l2_perf_level\": " << l2_perf_level << ",\n"; outfile << " \"tasks\": [\n"; for (size_t i = 0; i < tagged_records.size(); ++i) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 566249ac7..863299dbc 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -166,8 +166,9 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX ``` Per-thread fanout / fanin edge counts and ready-queue pop hit / miss -stats live in the v2 JSON `aicpu_scheduler_phases[]` and `deps.json`; -consume them via `simpler_setup/tools/sched_overhead_analysis.py`. +stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json` +captured at l2_perf_level >= 3) and `deps.json`; consume them via +`simpler_setup/tools/sched_overhead_analysis.py`. --- diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp index 808fce372..027805918 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp @@ -11,9 +11,9 @@ /** * @file dep_gen_replay.cpp - * @brief Replay in-memory DepGenRecord stream → deps.json (v2, tensor-annotated) - * via a host-resident PTO2TensorMap, with a differential check against - * the runtime template `compute_task_fanin`. + * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor + * representation, tensor-annotated) via a host-resident PTO2TensorMap, + * with a differential check against the runtime template `compute_task_fanin`. 
* * Two passes run per record against two parallel PTO2TensorMap instances that * evolve in lockstep: @@ -113,7 +113,7 @@ int32_t count_outputs(const DepGenRecord *records, size_t n) { } // --------------------------------------------------------------------------- -// v2 schema accumulators +// JSON output accumulators (in-memory tables that get serialized at the end) // --------------------------------------------------------------------------- // Edge categories — matches the three places a runtime fanin edge is born. @@ -240,7 +240,7 @@ uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) { return h; } -// Register a tensor in the v2 tensors[] table on first sight of (addr, +// Register a tensor in the tensors[] table on first sight of (addr, // version). buffer_numel describes the underlying storage size in elements; // per-edge fields describe the slice via (start_offset, strides[]). Subsequent // sightings of the same (addr, version) are no-ops. @@ -288,7 +288,7 @@ void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) { } // --------------------------------------------------------------------------- -// JSON writer (v2) +// JSON writer // --------------------------------------------------------------------------- void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) { @@ -300,7 +300,7 @@ void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) { out << ']'; } -bool write_deps_json_v2( +bool write_deps_json( const char *path, const std::vector &tasks, const std::vector &tensors, const std::vector &edges ) { @@ -309,13 +309,11 @@ bool write_deps_json_v2( LOG_ERROR("dep_gen replay: failed to open '%s' for write", path); return false; } - // Schema v3: strided tensor representation. 
- // tensors[*]: buffer_numel replaces raw_shapes (storage size in elements) - // edges[*]: consumer/producer_offset[] -> start_offset (uint64) + strides[] (int32) - // tasks[*].args[*]: offset[] -> start_offset + strides[] - out << "{\"version\":3"; - - out << ",\"tasks\":["; + // Strided tensor representation. tensors[].buffer_numel is the underlying + // storage element count; tasks[].args[] and edges[] carry per-slice + // geometry as (start_offset uint64, strides[] uint32 — runtime invariant + // forbids zero / negative strides, see runtime/tensor.h). + out << "{\"tasks\":["; for (size_t i = 0; i < tasks.size(); i++) { if (i > 0) out << ','; const auto &t = tasks[i]; @@ -448,7 +446,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records); return -1; } - LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (v2, dual-pass)", num_records); + LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records); // Per-ring task window sizes — tensormap masks slot indices and requires // each to be a power of two. Auto-size from the records themselves so each @@ -495,7 +493,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c return -3; } - // v2 accumulators. + // JSON output accumulators. 
std::vector task_table; std::vector tensor_table; std::unordered_map tensor_index; // tensor_id → table idx @@ -763,12 +761,12 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c tm_oracle.destroy(); tm_annot.destroy(); - if (!write_deps_json_v2(deps_json_path, task_table, tensor_table, annot_edges)) { + if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) { return -5; } LOG_INFO_V0( - "dep_gen replay: wrote deps.json v2 to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, - task_table.size(), tensor_table.size(), annot_edges.size() + "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(), + tensor_table.size(), annot_edges.size() ); return 0; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h index ea39bf7ea..daef4dfdd 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h @@ -29,26 +29,33 @@ * 1 KB GM store off the scheduler critical path). Replay sees every * submit and reconstructs the complete dependency graph. 
* - * Output format (deps.json, v2): + * Output format (deps.json, strided tensor representation): * - * {"version":2, - * "tasks": [{"task_id":, "scope":"auto|manual"}, ...], + * {"tasks": [{"task_id":, "scope":"auto|manual", + * "args":[{"idx":, "type":"", + * "tensor_id":, "dtype":"...", "shape":[...], + * "start_offset":, "strides":[...]}, ...]}, ...], * "tensors": [{"tensor_id":, "buffer_addr":, "version":, - * "dtype":"FLOAT32", "ndims":, "raw_shapes":[...]}, ...], + * "dtype":"FLOAT32", "buffer_numel":}, ...], * "edges": [{"pred":, "succ":, "arg":, * "source":"explicit|creator|tensormap", * "overlap":"covered|other" (tensormap only), * "tensor_id": (non-explicit), - * "consumer_dtype":"...", "consumer_shape":[...], "consumer_offset":[...], - * "producer_shape":[...] (tensormap), "producer_offset":[...] (tensormap)}, + * "consumer_dtype":"...", "consumer_shape":[...], + * "consumer_start_offset":, "consumer_strides":[...], + * "producer_shape":[...] (tensormap), + * "producer_start_offset": (tensormap), + * "producer_strides":[...] (tensormap)}, * ...]} * * - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``). * - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``. + * - ``buffer_numel`` is the underlying storage element count; tensor shapes + * are carried per-arg / per-edge alongside ``start_offset`` + ``strides``. * - Distinct producers / arg indices / sources keep their own edges; per-record * deduplication of producer ids mirrors the runtime * ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of - * ``(pred, succ)`` pairs in v2 is identical to what the runtime would have + * ``(pred, succ)`` pairs is identical to what the runtime would have * recorded. 
* * Self-checking: the replay runs two parallel tensormap instances per record — diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index fd356ccc6..ffa6fd335 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -431,9 +431,10 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent ); - // pop_hit / pop_miss per-emit deltas live in each v2 JSON dispatch - // record's extras; sum-of-deltas equals the run-cumulative tracked - // in this struct (final-drain emit covers the trailing-idle tail). + // pop_hit / pop_miss per-emit deltas live in each dispatch-phase + // record's extras in aicpu_scheduler_phases[]; sum-of-deltas equals + // the run-cumulative tracked in this struct (final-drain emit covers + // the trailing-idle tail). LOG_INFO_V9( "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle), l2_perf.sched_dispatch_cycle * 100.0 / sched_total diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 439ff2e61..6304c99df 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -92,7 +92,8 @@ int SchedulerContext::pop_ready_tasks_batch( int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); #endif // pop_hit / pop_miss are PTO2_PROFILING-gated (not the inner verbose tier) - // so the v2 JSON dispatch records carry queue-health stats on default builds. 
+ // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health + // stats on default builds. if (count > 0) { l2_perf.pop_hit += count; } else { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h index a20454615..53cd87b8d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h @@ -348,9 +348,10 @@ struct alignas(64) SchedL2PerfCounters { uint64_t sched_loop_count{0}; uint32_t phase_complete_count{0}; uint32_t phase_dispatch_count{0}; - // Run-cumulative pop counters; the v2 JSON dispatch-record emitter writes - // per-emit deltas computed as (current - pop_*_at_last_emit) and the - // end-of-run cold-path log reads the cumulatives directly. + // Run-cumulative pop counters; the dispatch-phase record emitter + // (aicpu_scheduler_phases[]) writes per-emit deltas computed as + // (current - pop_*_at_last_emit) and the end-of-run cold-path log reads + // the cumulatives directly. 
uint64_t pop_hit{0}; uint64_t pop_miss{0}; uint64_t pop_hit_at_last_emit{0}; diff --git a/src/a5/platform/src/host/l2_perf_collector.cpp b/src/a5/platform/src/host/l2_perf_collector.cpp index 93bb9f83d..028d374a6 100644 --- a/src/a5/platform/src/host/l2_perf_collector.cpp +++ b/src/a5/platform/src/host/l2_perf_collector.cpp @@ -579,9 +579,9 @@ int L2PerfCollector::export_swimlane_json() { return -1; } - int version = static_cast(l2_perf_level_); + int l2_perf_level = static_cast(l2_perf_level_); outfile << "{\n"; - outfile << " \"version\": " << version << ",\n"; + outfile << " \"l2_perf_level\": " << l2_perf_level << ",\n"; outfile << " \"tasks\": [\n"; for (size_t i = 0; i < tagged_records.size(); ++i) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 34e17cc86..79ae71b24 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -166,8 +166,9 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX ``` Per-thread fanout / fanin edge counts and ready-queue pop hit / miss -stats live in the v2 JSON `aicpu_scheduler_phases[]` and `deps.json`; -consume them via `simpler_setup/tools/sched_overhead_analysis.py`. +stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json` +captured at l2_perf_level >= 3) and `deps.json`; consume them via +`simpler_setup/tools/sched_overhead_analysis.py`. 
--- diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index aab3f22d4..ca01eaa0c 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -431,9 +431,10 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent ); - // pop_hit / pop_miss per-emit deltas live in each v2 JSON dispatch - // record's extras; sum-of-deltas equals the run-cumulative tracked - // in this struct (final-drain emit covers the trailing-idle tail). + // pop_hit / pop_miss per-emit deltas live in each dispatch-phase + // record's extras in aicpu_scheduler_phases[]; sum-of-deltas equals + // the run-cumulative tracked in this struct (final-drain emit covers + // the trailing-idle tail). LOG_INFO_V9( "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle), l2_perf.sched_dispatch_cycle * 100.0 / sched_total diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 6bd212f84..5af5c95ed 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -87,7 +87,8 @@ int SchedulerContext::pop_ready_tasks_batch( int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); #endif // pop_hit / pop_miss are PTO2_PROFILING-gated (not the inner verbose tier) - // so the v2 JSON dispatch records carry queue-health stats on default builds. 
+ // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health + // stats on default builds. if (count > 0) { l2_perf.pop_hit += count; } else { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h index 84991ae8e..ca099982f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h @@ -350,9 +350,10 @@ struct alignas(64) SchedL2PerfCounters { uint64_t sched_loop_count{0}; uint32_t phase_complete_count{0}; uint32_t phase_dispatch_count{0}; - // Run-cumulative pop counters; the v2 JSON dispatch-record emitter writes - // per-emit deltas computed as (current - pop_*_at_last_emit) and the - // end-of-run cold-path log reads the cumulatives directly. + // Run-cumulative pop counters; the dispatch-phase record emitter + // (aicpu_scheduler_phases[]) writes per-emit deltas computed as + // (current - pop_*_at_last_emit) and the end-of-run cold-path log reads + // the cumulatives directly. uint64_t pop_hit{0}; uint64_t pop_miss{0}; uint64_t pop_hit_at_last_emit{0}; diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py index 35a9d3276..428e6efbd 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py @@ -142,15 +142,15 @@ def _post_validate(self, case): return with deps_path.open() as f: deps = json.load(f) - # v3 schema: annotated edges with tasks[] / tensors[] sidecars carrying - # strided slice descriptors (start_offset + stride[]). Project annotated - # edges down to a (pred, succ) set for the existing structural checks; - # the annotation sanity check below verifies the tensor metadata path. 
- assert deps.get("version") == 3, f"deps.json version {deps.get('version')} != 3" + # Strided-Tensor schema: annotated edges with tasks[] / tensors[] + # sidecars carrying strided slice descriptors (start_offset + + # stride[]). Project annotated edges down to a (pred, succ) set for + # the existing structural checks; the annotation sanity check below + # verifies the tensor metadata path. raw_edges = deps.get("edges", []) deps_edges = set() for e in raw_edges: - assert isinstance(e, dict), f"v2 edge must be an object, got {type(e).__name__}: {e!r}" + assert isinstance(e, dict), f"deps.json edge must be an object, got {type(e).__name__}: {e!r}" pred, succ = e.get("pred"), e.get("succ") if pred is None or succ is None: continue @@ -174,13 +174,13 @@ def _post_validate(self, case): bad = {e for e in deps_edges if e[0] not in valid_ids or e[1] not in valid_ids} assert not bad, f"deps.json contains edges referencing unknown task ids: {bad}" - # ---- v2 annotated-edge sanity ---- - # Replay always emits the v2 schema with the tensor-info sidecar; the - # differential check inside the replay would have failed the run before - # we got here if the annotated pass disagreed with compute_task_fanin. - # These assertions just confirm the schema actually carries the - # expected blocks (so e.g. a future "always write empty arrays" bug - # would surface here, not silently in a downstream viewer). + # ---- Annotated-edge sanity ---- + # Replay always emits the tensor-info sidecar; the differential check + # inside the replay would have failed the run before we got here if + # the annotated pass disagreed with compute_task_fanin. These + # assertions just confirm the schema actually carries the expected + # blocks (so e.g. a future "always write empty arrays" bug would + # surface here, not silently in a downstream viewer). 
tasks = deps.get("tasks", []) tensors = deps.get("tensors", []) task_ids = {int(t["task_id"]) for t in tasks if "task_id" in t} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py index 90910bab6..cd2c3bc96 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py @@ -161,7 +161,6 @@ def _post_validate(self, case): with deps_path.open() as f: deps = json.load(f) - assert deps.get("version") == 3, f"deps.json version {deps.get('version')} != 3" raw_edges = deps.get("edges", []) # Project annotated edges → (pred, succ) — we only care about graph diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py index 43e4d7daa..c2b3a18e1 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py @@ -53,6 +53,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = with perf.open() as f: data = json.load(f) + assert data.get("l2_perf_level") in (1, 2, 3, 4), f"unexpected l2_perf_level: {data.get('l2_perf_level')}" tasks = data.get("tasks") assert isinstance(tasks, list), "tasks field missing or not a list" assert len(tasks) > 0, f"perf records empty under {perf}"