diff --git a/Makefile b/Makefile index f70265e..db2298c 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,8 @@ PYTHON ?= python ENV ?= tinyodomex OXIOD_ZIP ?= OxIOD.zip URBANSOUND8K_ARGS ?= +PYTHON_LIB_DIR := $(shell $(PYTHON) -c 'import sys; print(sys.prefix + "/lib")') +TEST_ENV := LD_LIBRARY_PATH="$(PYTHON_LIB_DIR):$(LD_LIBRARY_PATH)" help: @echo "Targets:" @@ -25,13 +27,13 @@ install: pip install -e . --no-deps test: - pytest test/ + $(TEST_ENV) $(PYTHON) -m pytest test/ integration-test: - RUN_INTEGRATION_TESTS=1 pytest test/integration/ + $(TEST_ENV) RUN_INTEGRATION_TESTS=1 $(PYTHON) -m pytest test/integration/ test-all: - RUN_INTEGRATION_TESTS=1 pytest test/ + $(TEST_ENV) RUN_INTEGRATION_TESTS=1 $(PYTHON) -m pytest test/ start-gpu: $(PYTHON) src/nas_model_client.py $(ARGS) diff --git a/README.md b/README.md index 18f646b..ccbe59e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ For the source-level architecture and extension points, see 1. **Training only** Use this when you want to run NAS/training without talking to hardware. - Start from `src/config/nas_config.yaml`, set `device.hil: false`, and read + Start from `src/config/nas_config_stm32.yaml`, set `device.hil: false`, and read [src/config/README.md](src/config/README.md) plus [src/README.md](src/README.md). For the UrbanSound8K audio DS-CNN path, start from [src/config/nas_config_audio_stm32.yaml](src/config/nas_config_audio_stm32.yaml) @@ -44,7 +44,7 @@ For the source-level architecture and extension points, see 3. **STM32 HIL** Use this for the current STM32 N6 backend. - Start from [src/config/nas_config.yaml](src/config/nas_config.yaml), then + Start from [src/config/nas_config_stm32.yaml](src/config/nas_config_stm32.yaml), then use [src/config/nas_config_audio_stm32.yaml](src/config/nas_config_audio_stm32.yaml) for the audio DS-CNN HIL path. 
Then read [src/tinyodom/microcontrollers/README.md](src/tinyodom/microcontrollers/README.md) @@ -183,10 +183,10 @@ refreshes the repo-local STM32 vendor subsets. The shipped starting points are: -- [src/config/nas_config.yaml](src/config/nas_config.yaml) - Default STM32-oriented config for the current `STM32_NUCLEO_N657X0_Q` - backend. This is the main starting point for STM32 runs and the general - example config for the repo. +- [src/config/nas_config_stm32.yaml](src/config/nas_config_stm32.yaml) + STM32-oriented config for the current `STM32_NUCLEO_N657X0_Q` + backend. This is the main starting point for STM32 runs and the most + complete commented example config in the repo. - [src/config/nas_config_ble.yaml](src/config/nas_config_ble.yaml) BLE-focused starting point for `ARDUINO_NANO_33_BLE_SENSE`. - [src/config/nas_config_portenta.yaml](src/config/nas_config_portenta.yaml) diff --git a/analysis_scripts/arena_latency_curve/run_arena_latency_curve.py b/analysis_scripts/arena_latency_curve/run_arena_latency_curve.py index 7bcf3b8..9be637e 100644 --- a/analysis_scripts/arena_latency_curve/run_arena_latency_curve.py +++ b/analysis_scripts/arena_latency_curve/run_arena_latency_curve.py @@ -276,7 +276,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--config", - default=str(SRC_DIR / "config" / "nas_config.yaml"), + default=str(SRC_DIR / "config" / "nas_config_stm32.yaml"), help="Path to TinyODOM config YAML.", ) parser.add_argument( diff --git a/analysis_scripts/arena_latency_curve/run_arena_latency_curve_failure_probe.py b/analysis_scripts/arena_latency_curve/run_arena_latency_curve_failure_probe.py index a1b83da..95f0ffd 100644 --- a/analysis_scripts/arena_latency_curve/run_arena_latency_curve_failure_probe.py +++ b/analysis_scripts/arena_latency_curve/run_arena_latency_curve_failure_probe.py @@ -280,7 +280,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--config", - default=str(SRC_DIR / "config" / 
"nas_config.yaml"), + default=str(SRC_DIR / "config" / "nas_config_stm32.yaml"), help="Path to TinyODOM config YAML.", ) parser.add_argument( diff --git a/analysis_scripts/cadenced_portenta_h7/README.md b/analysis_scripts/cadenced_portenta_h7/README.md index d975603..15047c9 100644 --- a/analysis_scripts/cadenced_portenta_h7/README.md +++ b/analysis_scripts/cadenced_portenta_h7/README.md @@ -42,7 +42,7 @@ Common overrides: ```bash python analysis_scripts/cadenced_portenta_h7/run_cadenced_portenta_h7.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --repeats 3 \ --cores cm7 cm4 \ --latency-budget-ms 200 \ diff --git a/analysis_scripts/cadenced_portenta_h7/run_cadenced_portenta_h7.py b/analysis_scripts/cadenced_portenta_h7/run_cadenced_portenta_h7.py index 6a46d0c..87dcfff 100644 --- a/analysis_scripts/cadenced_portenta_h7/run_cadenced_portenta_h7.py +++ b/analysis_scripts/cadenced_portenta_h7/run_cadenced_portenta_h7.py @@ -475,7 +475,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--config", - default=str(REPO_ROOT / "src" / "config" / "nas_config.yaml"), + default=str(REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml"), help="Path to TinyODOM config YAML.", ) parser.add_argument( diff --git a/analysis_scripts/clock_tick_latency/run_clock_tick_latency.py b/analysis_scripts/clock_tick_latency/run_clock_tick_latency.py index 01615fb..917853b 100644 --- a/analysis_scripts/clock_tick_latency/run_clock_tick_latency.py +++ b/analysis_scripts/clock_tick_latency/run_clock_tick_latency.py @@ -206,7 +206,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--config", - default=str(SRC_DIR / "config" / "nas_config.yaml"), + default=str(SRC_DIR / "config" / "nas_config_stm32.yaml"), help="Path to TinyODOM config YAML.", ) parser.add_argument( diff --git a/analysis_scripts/hil_noise_analysis/README.md b/analysis_scripts/hil_noise_analysis/README.md index a547faf..98512b1 
100644 --- a/analysis_scripts/hil_noise_analysis/README.md +++ b/analysis_scripts/hil_noise_analysis/README.md @@ -73,7 +73,7 @@ Dataset-specific generated headers live alongside them: ## Config selection -Set the following in `src/config/nas_config.yaml` to choose a variant when `energy_aware: true`: +Set the following in `src/config/nas_config_stm32.yaml` to choose a variant when `energy_aware: true`: - `input_mode: "uniform"` uses `sketches/tinyodom_inference_energy.ino` - `input_mode: "oxiod_representative"` uses `sketches/analysis_sketches/tinyodom_inference_representative.ino` with `oxiod_input_data.h` @@ -97,7 +97,7 @@ python analysis_scripts/hil_noise_analysis/oxiod_input_profile.py --split train # 2) On the GPU host, train and package the fixed 50-epoch artifact python analysis_scripts/hil_noise_analysis/train_noise_scan_model.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --epochs 50 \ --out-dir analysis_scripts/hil_noise_analysis/artifacts \ --artifact-prefix noise_scan_50ep @@ -126,7 +126,7 @@ just a trained-vs-untrained comparison: ```bash # 1) Train and export checkpoint stages on the GPU host python analysis_scripts/hil_noise_analysis/epoch_sweep/train_epoch_sweep.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --out-dir analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts \ --artifact-prefix noise_scan_epoch_sweep @@ -135,7 +135,7 @@ python analysis_scripts/hil_noise_analysis/epoch_sweep/audit_fresh_untrained_tfl # 3) Run HIL metrics across staged checkpoints python analysis_scripts/hil_noise_analysis/epoch_sweep/hil_epoch_sweep_scan.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --training-csv analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts/epoch_sweep_training_stats.csv \ --runs 1 \ --input-modes uniform diff --git a/analysis_scripts/hil_noise_analysis/epoch_sweep/README.md 
b/analysis_scripts/hil_noise_analysis/epoch_sweep/README.md index e73856a..c5e6968 100644 --- a/analysis_scripts/hil_noise_analysis/epoch_sweep/README.md +++ b/analysis_scripts/hil_noise_analysis/epoch_sweep/README.md @@ -32,7 +32,7 @@ This folder contains a two-step experiment flow: ```bash python analysis_scripts/hil_noise_analysis/epoch_sweep/train_epoch_sweep.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --out-dir analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts \ --artifact-prefix noise_scan_epoch_sweep \ --max-epochs 500 \ @@ -46,7 +46,7 @@ python analysis_scripts/hil_noise_analysis/epoch_sweep/train_epoch_sweep.py \ ```bash python analysis_scripts/hil_noise_analysis/epoch_sweep/train_epoch_sweep.py \ - --config src/config/nas_config.yaml \ + --config src/config/nas_config_stm32.yaml \ --out-dir analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts \ --plots-dir analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts/plots \ --csv-path analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts/epoch_sweep_training_stats.csv \ @@ -76,7 +76,7 @@ scp -r analysis_scripts/hil_noise_analysis/epoch_sweep/artifacts : argparse.ArgumentParser: "using harness timing/energy telemetry." 
) ) - parser.add_argument("--config", default=str(SRC_DIR / "config" / "nas_config.yaml"), help="TinyODOM config path (optional defaults source).") + parser.add_argument("--config", default=str(SRC_DIR / "config" / "nas_config_stm32.yaml"), help="TinyODOM config path (optional defaults source).") parser.add_argument("--dut-port", default=None, help="DUT serial port (default from config or /dev/ttyACM0).") parser.add_argument("--harness-port", default=None, help="Harness serial port (default from config or /dev/ttyACM1).") parser.add_argument("--harness-fqbn", default=None, help="Harness board FQBN (default from config or arduino:mbed_nano:nano33ble).") diff --git a/analysis_scripts/static_memory_proxy/.gitignore b/analysis_scripts/static_memory_proxy/.gitignore new file mode 100644 index 0000000..f8a4f0f --- /dev/null +++ b/analysis_scripts/static_memory_proxy/.gitignore @@ -0,0 +1,3 @@ +# Generated outputs from compute_static_memory_proxy.py +*.csv +*.png diff --git a/analysis_scripts/static_memory_proxy/README.md b/analysis_scripts/static_memory_proxy/README.md new file mode 100644 index 0000000..bfce93c --- /dev/null +++ b/analysis_scripts/static_memory_proxy/README.md @@ -0,0 +1,83 @@ +# Static Memory Proxy + +Offline prototype for adding a second cheap proxy line beside FLOPs for OdomTCN analysis. This does not modify NAS training, configs, export, flashing, or HIL. It reads logged trial CSVs, rebuilds each OdomTCN candidate from logged hyperparameters, and writes an augmented CSV with static memory proxy columns. + +## Proxy Definition + +The script estimates static memory traffic as: + +```text +memory_traffic_bytes = + sum_layers(input_activation_bytes + layer_weight_bytes + output_activation_bytes) +``` + +The prototype assumes batch size 1 and deployment dtype bytes: + +```text +float -> 4 bytes +int8_ptq -> 1 byte +``` + +If a row does not expose quantization mode, the script defaults to `int8_ptq` and records that in `proxy_quantization_mode_source`. 
+ +## Outputs + +The augmented CSV keeps all original columns and adds: + +```text +proxy_weight_bytes +proxy_activation_bytes +proxy_memory_traffic_bytes +proxy_dtype_bytes +proxy_warning_count +proxy_quantization_mode +proxy_quantization_mode_source +``` + +Use `--include-layer-details` to also write `proxy_layer_details_json`. + +## Run + +From the repository root: + +```bash +python analysis_scripts/static_memory_proxy/compute_static_memory_proxy.py \ + --config src/config/nas_config_memory_proxy.yaml \ + --trials-csv path/to/trials.csv \ + --output-csv analysis_scripts/static_memory_proxy/stm32_trials_with_memory_proxy.csv \ + --plot \ + --plot-dir analysis_scripts/static_memory_proxy +``` + +This writes: + +```text +analysis_scripts/static_memory_proxy/stm32_trials_with_memory_proxy.csv +analysis_scripts/static_memory_proxy/stm32_trials_with_memory_proxy_flops_vs_memory_traffic.png +analysis_scripts/static_memory_proxy/stm32_trials_with_memory_proxy_rmse_total_vs_memory_traffic.png +analysis_scripts/static_memory_proxy/stm32_trials_with_memory_proxy_energy_mj_per_inference_vs_memory_traffic.png +``` + +For multiple input CSVs, use `--output-dir` instead of `--output-csv`. + +## Plots + +With `--plot`, the script writes scatter plots for available columns: + +```text +flops vs proxy_memory_traffic_bytes +rmse_total vs proxy_memory_traffic_bytes +energy_mj_per_inference vs proxy_memory_traffic_bytes +``` + +It also prints Spearman and Kendall rank correlations for each plotted pair. These are rank correlations, so they measure monotonic ordering rather than exact linear fit. + +## Warning Count + +`proxy_warning_count` is the number of layer-level estimates where the script could not read exact symbolic activation tensors directly from Keras and used an inference path. 
OdomTCN uses the custom `TCN` layer; its residual blocks and child layers are visible, and weights are counted directly, but nested child input/output tensors are not exposed cleanly by Keras after build. The script infers those internal activation shapes from timestep count and channel/filter counts and marks them as warnings. + +A nonzero warning count does not mean the row failed. It means part of the estimate is architecture-aware static inference rather than a direct Keras tensor-shape read. + +## Limitations + +This is not predicted latency and not measured energy. It ignores cache behavior, DMA, tiling, operator fusion, im2col or temporary buffers, allocator overhead, flash/SRAM placement, alignment, backend-specific rereads, and kernel implementation details. Use it as a ranking proxy to compare against FLOPs and measured energy before deciding whether to wire it into NAS. diff --git a/analysis_scripts/static_memory_proxy/compute_static_memory_proxy.py b/analysis_scripts/static_memory_proxy/compute_static_memory_proxy.py new file mode 100644 index 0000000..768a436 --- /dev/null +++ b/analysis_scripts/static_memory_proxy/compute_static_memory_proxy.py @@ -0,0 +1,1156 @@ +"""Compute static memory-traffic proxy metrics for logged OdomTCN trials.""" + +from __future__ import annotations + +import argparse +import ast +import json +import math +import os +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Iterable + +os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2") + +REPO_ROOT = Path(__file__).resolve().parents[2] +SRC_DIR = REPO_ROOT / "src" +if str(SRC_DIR) not in sys.path: + sys.path.insert(0, str(SRC_DIR)) + +import pandas as pd # noqa: E402 +import tensorflow as tf # noqa: E402 +import yaml # noqa: E402 + +from tinyodom.model_families.odom_tcn import OdomTCNFamily # noqa: E402 +from tinyodom.pipeline_types import ModelBuildContext, TargetSpec # noqa: E402 + + +DTYPE_BYTES = { + "float": 4, + 
"float32": 4, + "int8": 1, + "int8_ptq": 1, +} +PROXY_COLUMNS = [ + "proxy_weight_bytes", + "proxy_activation_bytes", + "proxy_memory_traffic_bytes", + "proxy_dtype_bytes", + "proxy_warning_count", + "proxy_quantization_mode", + "proxy_quantization_mode_source", +] + + +@dataclass +class LayerEstimate: + """Memory proxy estimate for one accounted layer operation. + + Parameters + ---------- + layer_name : str + Human-readable layer path. + layer_type : str + Keras layer class name. + input_activation_bytes : int + Estimated bytes read from the input activation tensor. + weight_bytes : int + Estimated bytes read from layer weights. + output_activation_bytes : int + Estimated bytes written to the output activation tensor. + traffic_bytes : int + Sum of input activation, weight, and output activation bytes. + warning : str + Empty string when shapes were directly available, otherwise a short + explanation of the approximation used. + shape_source : str + Source used for activation shape estimates. + """ + + layer_name: str + layer_type: str + input_activation_bytes: int + weight_bytes: int + output_activation_bytes: int + traffic_bytes: int + warning: str + shape_source: str + + +@dataclass +class ProxyEstimate: + """Aggregate proxy metrics for one trial row. + + Parameters + ---------- + weight_bytes : int + Total unique model weight bytes accounted in the proxy. + activation_bytes : int + Sum of per-layer input and output activation bytes. + memory_traffic_bytes : int + Sum of per-layer input activation, weight, and output activation bytes. + dtype_bytes : int + Deployment dtype width in bytes. + warning_count : int + Number of layer operations whose activation shape was inferred or + unavailable. + layer_details : list[LayerEstimate] + Per-layer estimate details. 
+ """ + + weight_bytes: int + activation_bytes: int + memory_traffic_bytes: int + dtype_bytes: int + warning_count: int + layer_details: list[LayerEstimate] + + +def _parse_args() -> argparse.Namespace: + """Parse command-line arguments. + + Returns + ------- + argparse.Namespace + Parsed CLI options. + """ + + parser = argparse.ArgumentParser( + description="Augment OdomTCN NAS trial CSVs with static memory-traffic proxy metrics." + ) + parser.add_argument("--config", required=True, type=Path, help="NAS config YAML path.") + parser.add_argument( + "--trials-csv", + required=True, + type=Path, + nargs="+", + help="One or more NAS trial CSVs to augment.", + ) + parser.add_argument( + "--output-csv", + type=Path, + help="Output CSV path. Only valid when one --trials-csv is provided.", + ) + parser.add_argument( + "--output-dir", + type=Path, + help="Directory for augmented CSVs when processing one or more inputs.", + ) + parser.add_argument( + "--input-shape", + help="Override input shape as TIMESTEPS,INPUT_DIM. Defaults to row hparams, then config.", + ) + parser.add_argument( + "--max-rows", + type=int, + help="Process only the first N rows. Useful for quick validation.", + ) + parser.add_argument( + "--include-layer-details", + action="store_true", + help="Include proxy_layer_details_json in the augmented CSV.", + ) + parser.add_argument( + "--plot", + action="store_true", + help="Write scatter plots and print optional correlations.", + ) + parser.add_argument( + "--plot-dir", + type=Path, + help="Directory for plot PNGs. Defaults to each output CSV directory.", + ) + return parser.parse_args() + + +def _load_yaml(path: Path) -> dict[str, Any]: + """Load a YAML file as a plain dictionary. + + Parameters + ---------- + path : pathlib.Path + YAML file path. + + Returns + ------- + dict[str, Any] + Parsed YAML payload. 
+ """ + + with path.open("r", encoding="utf-8") as handle: + payload = yaml.safe_load(handle) or {} + if not isinstance(payload, dict): + raise ValueError(f"Expected mapping at config root: {path}") + return payload + + +def _is_missing(value: Any) -> bool: + """Return whether a CSV cell should be treated as missing. + + Parameters + ---------- + value : Any + Candidate cell value. + + Returns + ------- + bool + True when the value is absent or NaN-like. + """ + + if value is None: + return True + try: + return bool(pd.isna(value)) + except (TypeError, ValueError): + return False + + +def _parse_cell(value: Any) -> Any: + """Parse one CSV cell into a Python value when possible. + + Parameters + ---------- + value : Any + Raw CSV cell value. + + Returns + ------- + Any + Parsed scalar/list/dict value, or the original value. + """ + + if _is_missing(value): + return None + if not isinstance(value, str): + return value + text = value.strip() + if text == "": + return None + lowered = text.lower() + if lowered in {"true", "false"}: + return lowered == "true" + if lowered in {"none", "null"}: + return None + if text[:1] in {"[", "{", "("}: + for parser in (json.loads, ast.literal_eval): + try: + return parser(text) + except (ValueError, SyntaxError, TypeError, json.JSONDecodeError): + continue + try: + if any(marker in text for marker in (".", "e", "E")): + return float(text) + return int(text) + except ValueError: + return text + + +def _first_present(row: pd.Series, names: Iterable[str]) -> tuple[Any, str | None]: + """Return the first present value from a row. + + Parameters + ---------- + row : pandas.Series + Input row. + names : Iterable[str] + Candidate column names in priority order. + + Returns + ------- + tuple[Any, str | None] + Parsed value and the column that supplied it. 
+ """ + + for name in names: + if name in row.index and not _is_missing(row[name]): + parsed = _parse_cell(row[name]) + if parsed is not None: + return parsed, name + return None, None + + +def _row_value(row: pd.Series, logical_name: str) -> tuple[Any, str | None]: + """Resolve a logical trial field across supported CSV schemas. + + Parameters + ---------- + row : pandas.Series + Input row. + logical_name : str + Unprefixed hparam or metric name. + + Returns + ------- + tuple[Any, str | None] + Parsed value and source column. + """ + + return _first_present( + row, + ( + logical_name, + f"hparam__{logical_name}", + f"user_attrs_hparam__{logical_name}", + f"params_{logical_name}", + f"user_attrs_{logical_name}", + ), + ) + + +def _config_input_shape(config: dict[str, Any]) -> tuple[int, int] | None: + """Infer a model input shape from the run config without loading data. + + Parameters + ---------- + config : dict[str, Any] + Parsed NAS config. + + Returns + ------- + tuple[int, int] | None + ``(timesteps, input_dim)`` when inferable. + """ + + dataset = config.get("dataset", {}) or {} + params = dataset.get("params", {}) or {} + timesteps = params.get("window_size") + input_dim = params.get("input_dim") + if input_dim is None and str(dataset.get("name", "")).strip().lower() == "oxiod": + input_dim = 10 + if timesteps is None or input_dim is None: + return None + return int(timesteps), int(input_dim) + + +def _parse_input_shape_override(raw_value: str | None) -> tuple[int, int] | None: + """Parse the optional CLI input-shape override. + + Parameters + ---------- + raw_value : str | None + Raw CLI value. + + Returns + ------- + tuple[int, int] | None + Parsed shape or None when no override was provided. 
+ """ + + if raw_value is None: + return None + parts = [part.strip() for part in raw_value.split(",")] + if len(parts) != 2: + raise ValueError("--input-shape must use TIMESTEPS,INPUT_DIM format.") + timesteps, input_dim = (int(parts[0]), int(parts[1])) + if timesteps <= 0 or input_dim <= 0: + raise ValueError("--input-shape dimensions must be positive.") + return timesteps, input_dim + + +def _resolve_input_shape( + row: pd.Series, + config_shape: tuple[int, int] | None, + override_shape: tuple[int, int] | None, +) -> tuple[int, int]: + """Resolve the OdomTCN logical input shape for one row. + + Parameters + ---------- + row : pandas.Series + Input trial row. + config_shape : tuple[int, int] | None + Shape inferred from config. + override_shape : tuple[int, int] | None + Explicit CLI override. + + Returns + ------- + tuple[int, int] + ``(timesteps, input_dim)``. + """ + + if override_shape is not None: + return override_shape + row_timesteps, _ = _row_value(row, "timesteps") + row_input_dim, _ = _row_value(row, "input_dim") + if row_timesteps is not None and row_input_dim is not None: + return int(row_timesteps), int(row_input_dim) + if config_shape is not None: + return config_shape + raise ValueError( + "Could not resolve input shape. Provide hparam__timesteps/hparam__input_dim " + "columns or pass --input-shape TIMESTEPS,INPUT_DIM." + ) + + +def _normalize_quantization_mode(row: pd.Series) -> tuple[str, int, str]: + """Resolve quantization mode and dtype width for one row. + + Parameters + ---------- + row : pandas.Series + Input trial row. + + Returns + ------- + tuple[str, int, str] + Normalized mode, dtype bytes, and source description. 
+ """ + + value, source = _first_present( + row, + ( + "quantization_mode", + "params_quantization_mode", + "user_attrs_quantization_mode", + "hparam__quantization_mode", + "user_attrs_hparam__quantization_mode", + ), + ) + if value is None: + return "int8_ptq", DTYPE_BYTES["int8_ptq"], "defaulted_missing" + mode = str(value).strip().lower() + if mode not in DTYPE_BYTES: + raise ValueError(f"Unsupported quantization mode '{value}'. Expected float or int8_ptq.") + if mode in {"float32"}: + mode = "float" + if mode in {"int8"}: + mode = "int8_ptq" + return mode, DTYPE_BYTES[mode], str(source) + + +def _trial_hparams(row: pd.Series, input_shape: tuple[int, int]) -> dict[str, Any]: + """Decode OdomTCN hyperparameters from one trial row. + + Parameters + ---------- + row : pandas.Series + Input trial row. + input_shape : tuple[int, int] + Logical model input shape. + + Returns + ------- + dict[str, Any] + Build-time OdomTCN hyperparameters. + """ + + raw: dict[str, Any] = {} + for name in ( + "nb_filters", + "kernel_size", + "dropout_rate", + "use_skip_connections", + "norm_flag", + "dilations", + "dilations_index", + ): + value, _ = _row_value(row, name) + if value is not None: + raw[name] = value + missing = [ + name + for name in ("nb_filters", "kernel_size", "dropout_rate", "use_skip_connections", "norm_flag") + if name not in raw + ] + if "dilations" not in raw and "dilations_index" not in raw: + missing.append("dilations or dilations_index") + if missing: + raise ValueError(f"Missing OdomTCN hparam columns: {', '.join(missing)}") + + ctx = _build_context(input_shape) + decoded = OdomTCNFamily().decode_trial_hparams(raw, ctx, {}) + decoded["nb_filters"] = int(decoded["nb_filters"]) + decoded["kernel_size"] = int(decoded["kernel_size"]) + decoded["dropout_rate"] = float(decoded["dropout_rate"]) + decoded["use_skip_connections"] = bool(decoded["use_skip_connections"]) + decoded["norm_flag"] = bool(decoded["norm_flag"]) + decoded["dilations"] = [int(value) for value 
in decoded["dilations"]] + return decoded + + +def _build_context(input_shape: tuple[int, int]) -> ModelBuildContext: + """Build the minimal OdomTCN model context used by this analysis. + + Parameters + ---------- + input_shape : tuple[int, int] + Logical model input shape. + + Returns + ------- + tinyodom.pipeline_types.ModelBuildContext + Build context with the odometry two-head target spec. + """ + + return ModelBuildContext( + input_shape=input_shape, + input_dtype="float32", + target_spec=TargetSpec( + task_type="regression", + output_names=["velx", "vely"], + output_shapes=[(1,), (1,)], + ), + ) + + +def _shape_elements(shape_like: Any) -> int | None: + """Count elements in one tensor shape with batch size fixed to one. + + Parameters + ---------- + shape_like : Any + Tensor, TensorShape, tuple, or nested collection of those. + + Returns + ------- + int | None + Element count, or None when any non-batch dimension is unknown. + """ + + if shape_like is None: + return None + if isinstance(shape_like, (list, tuple)) and shape_like and not all( + isinstance(dim, (int, type(None))) for dim in shape_like + ): + total = 0 + for item in shape_like: + item_elements = _shape_elements(item) + if item_elements is None: + return None + total += item_elements + return total + shape = getattr(shape_like, "shape", shape_like) + try: + dims = list(shape.as_list()) + except AttributeError: + try: + dims = list(shape) + except TypeError: + return None + if not dims: + return 1 + if dims[0] is None: + dims[0] = 1 + elements = 1 + for dim in dims: + if dim is None: + return None + elements *= int(dim) + return int(elements) + + +def _layer_tensor_elements(layer: tf.keras.layers.Layer, attr_name: str) -> int | None: + """Read layer input/output tensor element counts. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Layer to inspect. + attr_name : str + Either ``"input"`` or ``"output"``. 
+ + Returns + ------- + int | None + Element count when Keras exposes a concrete symbolic tensor shape. + """ + + try: + return _shape_elements(getattr(layer, attr_name)) + except (AttributeError, RuntimeError, ValueError): + return None + + +def _weight_bytes(layer: tf.keras.layers.Layer, dtype_bytes: int, seen_weights: set[int]) -> int: + """Count unique weight bytes for one layer. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Layer whose weights should be counted. + dtype_bytes : int + Deployment dtype width. + seen_weights : set[int] + Mutable set of already-counted Keras variable identities. + + Returns + ------- + int + Bytes for weights not previously seen. + """ + + total = 0 + for weight in getattr(layer, "weights", []) or []: + key = id(weight) + if key in seen_weights: + continue + seen_weights.add(key) + elements = _shape_elements(weight) + if elements is not None: + total += elements * dtype_bytes + return int(total) + + +def _make_estimate( + *, + layer_name: str, + layer_type: str, + input_elements: int | None, + weight_bytes: int, + output_elements: int | None, + dtype_bytes: int, + warning: str, + shape_source: str, +) -> LayerEstimate: + """Create a layer estimate from element counts. + + Parameters + ---------- + layer_name : str + Layer path. + layer_type : str + Layer class name. + input_elements : int | None + Input activation elements. + weight_bytes : int + Weight bytes. + output_elements : int | None + Output activation elements. + dtype_bytes : int + Deployment dtype width. + warning : str + Warning text, if any. + shape_source : str + Shape source label. + + Returns + ------- + LayerEstimate + Completed per-layer estimate. 
+ """ + + input_bytes = 0 if input_elements is None else int(input_elements) * dtype_bytes + output_bytes = 0 if output_elements is None else int(output_elements) * dtype_bytes + return LayerEstimate( + layer_name=layer_name, + layer_type=layer_type, + input_activation_bytes=input_bytes, + weight_bytes=int(weight_bytes), + output_activation_bytes=output_bytes, + traffic_bytes=input_bytes + int(weight_bytes) + output_bytes, + warning=warning, + shape_source=shape_source, + ) + + +def _generic_layer_estimate( + layer: tf.keras.layers.Layer, + dtype_bytes: int, + seen_weights: set[int], +) -> LayerEstimate: + """Estimate memory traffic for a standard Keras layer. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Layer to inspect. + dtype_bytes : int + Deployment dtype width. + seen_weights : set[int] + Mutable set of already-counted Keras weights. + + Returns + ------- + LayerEstimate + Estimate based on Keras symbolic input/output tensor shapes. + """ + + input_elements = _layer_tensor_elements(layer, "input") + output_elements = _layer_tensor_elements(layer, "output") + warning = "" + shape_source = "keras_tensor" + if input_elements is None or output_elements is None: + warning = "missing_keras_activation_shape" + shape_source = "unavailable" + return _make_estimate( + layer_name=layer.name, + layer_type=type(layer).__name__, + input_elements=input_elements, + weight_bytes=_weight_bytes(layer, dtype_bytes, seen_weights), + output_elements=output_elements, + dtype_bytes=dtype_bytes, + warning=warning, + shape_source=shape_source, + ) + + +def _conv_output_channels(layer: tf.keras.layers.Layer) -> int | None: + """Infer output channel count for a Conv1D-like layer. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Candidate Conv1D layer. + + Returns + ------- + int | None + Output channel count when available. 
def _estimate_tcn_layer(
    layer: tf.keras.layers.Layer,
    dtype_bytes: int,
    seen_weights: set[int],
) -> list[LayerEstimate]:
    """Estimate a keras-tcn TCN layer, including exposed residual blocks.

    Parameters
    ----------
    layer : tensorflow.keras.layers.Layer
        TCN layer to inspect.
    dtype_bytes : int
        Deployment dtype width.
    seen_weights : set[int]
        Mutable set of already-counted Keras weights.

    Returns
    -------
    list[LayerEstimate]
        Per-operation estimates for the TCN internals and final output slice.
        When the input shape or the residual blocks are not exposed, a single
        black-box estimate for the whole layer is returned instead.
    """

    input_shape = getattr(getattr(layer, "input", None), "shape", None)
    output_elements = _layer_tensor_elements(layer, "output")
    residual_blocks = list(getattr(layer, "residual_blocks", []) or [])
    if input_shape is None or len(input_shape) < 3 or not residual_blocks:
        # Internals are not introspectable: fall back to one opaque estimate.
        estimate = _generic_layer_estimate(layer, dtype_bytes, seen_weights)
        estimate.warning = "tcn_internal_layers_unavailable"
        estimate.shape_source = "keras_tensor_black_box"
        return [estimate]

    timesteps = int(input_shape[1])
    current_channels = int(input_shape[2])
    estimates: list[LayerEstimate] = []
    for block_index, block in enumerate(residual_blocks):
        # Channel count entering the block; "matching_" children read it.
        residual_channels = current_channels
        block_layers = list(getattr(block, "_layers", []) or getattr(block, "layers", []) or [])
        for child in block_layers:
            layer_type = type(child).__name__
            child_name = f"{layer.name}/residual_block_{block_index}/{child.name}"
            is_matching = str(child.name).startswith("matching_")
            if layer_type == "Conv1D":
                output_channels = _conv_output_channels(child)
                if output_channels is None:
                    input_elements = None
                    output_child_elements = None
                else:
                    input_channels = residual_channels if is_matching else current_channels
                    input_elements = timesteps * input_channels
                    output_child_elements = timesteps * output_channels
                    if not is_matching:
                        current_channels = output_channels
            elif layer_type == "Lambda" and is_matching:
                input_elements = timesteps * residual_channels
                output_child_elements = timesteps * residual_channels
            else:
                input_elements = timesteps * current_channels
                output_child_elements = timesteps * current_channels
            estimates.append(
                _make_estimate(
                    layer_name=child_name,
                    layer_type=layer_type,
                    input_elements=input_elements,
                    weight_bytes=_weight_bytes(child, dtype_bytes, seen_weights),
                    output_elements=output_child_elements,
                    dtype_bytes=dtype_bytes,
                    warning="inferred_tcn_internal_activation_shape",
                    shape_source="inferred_tcn_residual_block",
                )
            )
        # A block that exposes no child layers keeps the incoming channel count
        # instead of raising IndexError on block_layers[0].
        if block_layers:
            current_channels = _conv_output_channels(block_layers[0]) or current_channels

    estimates.append(
        _make_estimate(
            layer_name=f"{layer.name}/output_slice",
            layer_type=type(layer).__name__,
            input_elements=timesteps * current_channels,
            weight_bytes=0,
            output_elements=output_elements,
            dtype_bytes=dtype_bytes,
            warning="inferred_tcn_final_sequence_to_vector_shape",
            shape_source="inferred_tcn_output_slice",
        )
    )
    return estimates


def _estimate_model(model: tf.keras.Model, dtype_bytes: int) -> ProxyEstimate:
    """Estimate static memory traffic for one built Keras model.

    Parameters
    ----------
    model : tensorflow.keras.Model
        Built OdomTCN model.
    dtype_bytes : int
        Deployment dtype width.

    Returns
    -------
    ProxyEstimate
        Aggregate and layer-level proxy metrics.
    """

    details: list[LayerEstimate] = []
    seen_weights: set[int] = set()
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.InputLayer):
            continue
        if type(layer).__name__ == "TCN":
            details.extend(_estimate_tcn_layer(layer, dtype_bytes, seen_weights))
        else:
            details.append(_generic_layer_estimate(layer, dtype_bytes, seen_weights))

    # Weights that no layer estimate marked as seen are still counted once,
    # under a synthetic entry, so the aggregate weight total includes them.
    unassigned_weight_bytes = 0
    for weight in model.weights:
        key = id(weight)
        if key in seen_weights:
            continue
        seen_weights.add(key)
        elements = _shape_elements(weight)
        if elements is not None:
            unassigned_weight_bytes += elements * dtype_bytes
    if unassigned_weight_bytes:
        details.append(
            _make_estimate(
                layer_name="unassigned_model_weights",
                layer_type="Weights",
                input_elements=0,
                weight_bytes=unassigned_weight_bytes,
                output_elements=0,
                dtype_bytes=dtype_bytes,
                warning="weights_not_attributed_to_layer",
                shape_source="model_weights",
            )
        )

    weight_bytes = sum(item.weight_bytes for item in details)
    activation_bytes = sum(
        item.input_activation_bytes + item.output_activation_bytes for item in details
    )
    memory_traffic_bytes = sum(item.traffic_bytes for item in details)
    warning_count = sum(1 for item in details if item.warning)
    return ProxyEstimate(
        weight_bytes=int(weight_bytes),
        activation_bytes=int(activation_bytes),
        memory_traffic_bytes=int(memory_traffic_bytes),
        dtype_bytes=dtype_bytes,
        warning_count=int(warning_count),
        layer_details=details,
    )


def _estimate_row(
    row: pd.Series,
    config_shape: tuple[int, int] | None,
    override_shape: tuple[int, int] | None,
) -> dict[str, Any]:
    """Compute proxy output columns for one CSV row.

    Parameters
    ----------
    row : pandas.Series
        Trial row.
    config_shape : tuple[int, int] | None
        Shape inferred from config.
    override_shape : tuple[int, int] | None
        Explicit CLI override.

    Returns
    -------
    dict[str, Any]
        Proxy output columns and layer details.
    """

    quant_mode, dtype_bytes, quant_source = _normalize_quantization_mode(row)
    input_shape = _resolve_input_shape(row, config_shape, override_shape)
    hparams = _trial_hparams(row, input_shape)
    family = OdomTCNFamily()
    ctx = _build_context(input_shape)
    try:
        model = family.build_model(hparams, ctx, {})
        estimate = _estimate_model(model, dtype_bytes)
    finally:
        # Clear the Keras session even when building or estimating fails.
        tf.keras.backend.clear_session()
    return {
        "proxy_weight_bytes": estimate.weight_bytes,
        "proxy_activation_bytes": estimate.activation_bytes,
        "proxy_memory_traffic_bytes": estimate.memory_traffic_bytes,
        "proxy_dtype_bytes": estimate.dtype_bytes,
        "proxy_warning_count": estimate.warning_count,
        "proxy_quantization_mode": quant_mode,
        "proxy_quantization_mode_source": quant_source,
        "proxy_layer_details_json": json.dumps(
            [asdict(item) for item in estimate.layer_details],
            sort_keys=True,
        ),
    }


def _default_output_path(input_csv: Path, output_dir: Path | None) -> Path:
    """Return the default augmented CSV path for one input.

    Parameters
    ----------
    input_csv : pathlib.Path
        Source CSV path.
    output_dir : pathlib.Path | None
        Optional output directory override.

    Returns
    -------
    pathlib.Path
        Destination CSV path: the input stem plus ``_with_memory_proxy``, with
        the input suffix, inside ``output_dir`` or next to the input.
    """

    parent = input_csv.parent if output_dir is None else output_dir
    return parent / f"{input_csv.stem}_with_memory_proxy{input_csv.suffix}"


def _augment_csv(
    *,
    input_csv: Path,
    output_csv: Path,
    config_shape: tuple[int, int] | None,
    override_shape: tuple[int, int] | None,
    max_rows: int | None,
    include_layer_details: bool,
) -> pd.DataFrame:
    """Read, augment, and write one trial CSV.

    Parameters
    ----------
    input_csv : pathlib.Path
        Source trial CSV.
    output_csv : pathlib.Path
        Destination augmented CSV.
    config_shape : tuple[int, int] | None
        Shape inferred from config.
    override_shape : tuple[int, int] | None
        Explicit CLI shape override.
    max_rows : int | None
        Optional row cap.
    include_layer_details : bool
        Whether to retain the large layer-detail JSON column.

    Returns
    -------
    pandas.DataFrame
        Augmented dataframe.

    Raises
    ------
    RuntimeError
        If any row cannot be estimated; the original exception is chained.
    """

    frame = pd.read_csv(input_csv)
    if max_rows is not None:
        frame = frame.head(max_rows).copy()
    proxy_rows: list[dict[str, Any]] = []
    for index, row in frame.iterrows():
        try:
            proxy_rows.append(_estimate_row(row, config_shape, override_shape))
        except Exception as exc:  # pragma: no cover - exercised by CLI diagnostics.
            raise RuntimeError(f"Failed to estimate row {index} from {input_csv}: {exc}") from exc

    proxy_frame = pd.DataFrame(proxy_rows)
    if not include_layer_details and "proxy_layer_details_json" in proxy_frame:
        proxy_frame = proxy_frame.drop(columns=["proxy_layer_details_json"])
    # When the input was already augmented, replace its stale proxy columns
    # instead of emitting duplicate column names (which later make
    # frame["proxy_memory_traffic_bytes"] a DataFrame rather than a Series).
    stale_columns = [column for column in proxy_frame.columns if column in frame.columns]
    base = frame.drop(columns=stale_columns).reset_index(drop=True)
    output = pd.concat([base, proxy_frame], axis=1)
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    output.to_csv(output_csv, index=False)
    return output


def _numeric_series(
    frame: pd.DataFrame,
    candidates: Iterable[str],
) -> tuple[pd.Series | None, str | None]:
    """Return the first usable numeric series from candidate columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataframe to inspect.
    candidates : Iterable[str]
        Candidate column names, in priority order.

    Returns
    -------
    tuple[pandas.Series | None, str | None]
        Numeric series and source column name, or ``(None, None)`` when no
        candidate column exists with at least one numeric value.
    """

    for column in candidates:
        if column not in frame.columns:
            continue
        # Non-numeric cells become NaN; a column of only NaN is not usable.
        series = pd.to_numeric(frame[column], errors="coerce")
        if series.notna().any():
            return series, column
    return None, None


def _plot_outputs(frame: pd.DataFrame, output_csv: Path, plot_dir: Path | None) -> None:
    """Write optional scatter plots and print rank correlations.

    Parameters
    ----------
    frame : pandas.DataFrame
        Augmented dataframe.
    output_csv : pathlib.Path
        CSV path used for plot naming.
    plot_dir : pathlib.Path | None
        Optional destination directory.

    Returns
    -------
    None
        Writes PNGs when matplotlib is available.
    """

    try:
        import matplotlib

        matplotlib.use("Agg", force=True)
        import matplotlib.pyplot as plt
    except ImportError:
        print("Plotting skipped: matplotlib is not available.")
        return

    target_dir = output_csv.parent if plot_dir is None else plot_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    x = pd.to_numeric(frame["proxy_memory_traffic_bytes"], errors="coerce")
    # Each spec is (candidate source columns in priority order, plot label).
    plot_specs = [
        (
            ("flops", "user_attrs_flops"),
            "flops",
        ),
        (
            ("metric__rmse_total", "rmse_total", "values_rmse_total", "user_attrs_metric__rmse_total"),
            "rmse_total",
        ),
        (
            (
                "energy_mj_per_inference",
                "values_energy_mj_per_inference",
                "user_attrs_energy_mj_per_inference",
            ),
            "energy_mj_per_inference",
        ),
    ]
    for candidates, label in plot_specs:
        y, source = _numeric_series(frame, candidates)
        if y is None or source is None:
            continue
        # NOTE(review): the 1e11 ceiling presumably filters penalty/sentinel
        # objective values -- confirm against the trial writer.
        valid = x.notna() & y.notna() & (x >= 0) & (y >= 0) & (y < 1.0e11)
        if valid.sum() < 2:
            continue
        path = target_dir / f"{output_csv.stem}_{label}_vs_memory_traffic.png"
        fig, ax = plt.subplots(figsize=(6.0, 4.0))
        try:
            ax.scatter(x[valid], y[valid], s=18, alpha=0.75)
            ax.set_xlabel("static memory traffic bytes")
            ax.set_ylabel(label)
            ax.set_title(f"{label} vs static memory traffic")
            fig.tight_layout()
            fig.savefig(path, dpi=160)
        finally:
            # Release the figure even if rendering or saving fails.
            plt.close(fig)
        print(f"Wrote plot: {path}")
        _print_correlations(x[valid], y[valid], label)


def _rank_correlations_without_scipy(x: pd.Series, y: pd.Series) -> tuple[float, float]:
    """Return Spearman rho and Kendall tau-b without importing SciPy.

    Parameters
    ----------
    x : pandas.Series
        Proxy metric values.
    y : pandas.Series
        Target metric values.

    Returns
    -------
    tuple[float, float]
        ``(spearman, kendall)`` over the pairwise-complete observations; a
        value is NaN when it is undefined (for example a constant column).
    """

    left, right = x.align(y, join="inner")
    paired = left.notna() & right.notna()
    left = left[paired]
    right = right[paired]

    # Spearman rho is the Pearson correlation of the average ranks.
    spearman = float(left.rank().corr(right.rank()))

    xs = left.tolist()
    ys = right.tolist()
    concordant = discordant = only_x_tied = only_y_tied = 0
    for position, (x_i, y_i) in enumerate(zip(xs, ys)):
        for x_j, y_j in zip(xs[position + 1:], ys[position + 1:]):
            dx = x_i - x_j
            dy = y_i - y_j
            if dx == 0 and dy == 0:
                continue  # joint ties contribute to neither tau-b factor
            if dx == 0:
                only_x_tied += 1
            elif dy == 0:
                only_y_tied += 1
            elif (dx > 0) == (dy > 0):
                concordant += 1
            else:
                discordant += 1
    untied = concordant + discordant
    denominator = math.sqrt((untied + only_x_tied) * (untied + only_y_tied))
    kendall = (concordant - discordant) / denominator if denominator else math.nan
    return spearman, kendall


def _print_correlations(x: pd.Series, y: pd.Series, label: str) -> None:
    """Print optional Spearman and Kendall correlations.

    Parameters
    ----------
    x : pandas.Series
        Proxy metric values.
    y : pandas.Series
        Target metric values.
    label : str
        Target metric label.

    Returns
    -------
    None
        Prints correlations to stdout.
    """

    try:
        from scipy import stats
    except ImportError:
        # Series.corr(method="spearman"/"kendall") imports SciPy internally, so
        # the fallback must not go through it.
        spearman, kendall = _rank_correlations_without_scipy(x, y)
        print(
            f"{label}: scipy unavailable; pandas Spearman={spearman:.4g}, "
            f"Kendall={kendall:.4g}"
        )
        return
    # Tuple unpacking works on both old and new SciPy result objects; the
    # ``.statistic`` attribute is missing on older releases.
    rho, rho_p = stats.spearmanr(x, y, nan_policy="omit")
    tau, tau_p = stats.kendalltau(x, y, nan_policy="omit")
    if math.isnan(float(rho)):
        print(f"{label}: rank correlation is undefined for this data (e.g. a constant column)")
        return
    print(
        f"{label}: Spearman={rho:.4g} (p={rho_p:.4g}), "
        f"Kendall={tau:.4g} (p={tau_p:.4g})"
    )


def main() -> None:
    """Run the static memory proxy CLI.

    Returns
    -------
    None
        Writes augmented CSVs and optional plots.

    Raises
    ------
    SystemExit
        If the output options conflict or ``--max-rows`` is not positive.
    """

    args = _parse_args()
    if args.output_csv is not None and len(args.trials_csv) != 1:
        raise SystemExit("--output-csv can only be used with exactly one --trials-csv input.")
    if args.output_csv is not None and args.output_dir is not None:
        raise SystemExit("Use only one of --output-csv or --output-dir.")
    if args.max_rows is not None and args.max_rows <= 0:
        raise SystemExit("--max-rows must be positive.")

    config = _load_yaml(args.config)
    config_shape = _config_input_shape(config)
    override_shape = _parse_input_shape_override(args.input_shape)

    for input_csv in args.trials_csv:
        output_csv = args.output_csv or _default_output_path(input_csv, args.output_dir)
        output = _augment_csv(
            input_csv=input_csv,
            output_csv=output_csv,
            config_shape=config_shape,
            override_shape=override_shape,
            max_rows=args.max_rows,
            include_layer_details=args.include_layer_details,
        )
        print(f"Wrote augmented CSV: {output_csv} ({len(output)} rows)")
        if args.plot:
            _plot_outputs(output, output_csv, args.plot_dir)


if __name__ == "__main__":
    main()
b/analysis_scripts/stm32_example_project/README.md @@ -271,7 +271,7 @@ Useful flags: ```bash conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --clean -conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --config src/config/nas_config.yaml +conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --config src/config/nas_config_stm32.yaml conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --dut-port /dev/ttyACM0 --harness-port /dev/ttyACM1 conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --output analysis_scripts/stm32_example_project/last_metrics.json conda run -n tinyodomex python analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py --latency-budget-ms 200.0 @@ -532,7 +532,7 @@ Common commands: ```bash # Rebuild the default perturbed model and restage the generated network files: conda run -n tinyodomex python analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py \ - --config src/config/nas_config.yaml + --config src/config/nas_config_stm32.yaml # Stage a prebuilt TFLite model instead of rebuilding one: conda run -n tinyodomex python analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py \ diff --git a/analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py b/analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py index ceffeaf..748db4d 100644 --- a/analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py +++ b/analysis_scripts/stm32_example_project/generate_and_stage_stm32_toy_ai.py @@ -29,7 +29,7 @@ from stm32_phase2_candidate import export_perturbed_candidate_tflite DEFAULT_PROJECT_ROOT = SCRIPT_DIR / "stm32_toy_ai_project" / "FSBL" -DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" / "nas_config.yaml" +DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" 
/ "nas_config_stm32.yaml" DEFAULT_OUTPUT_ROOT = Path("/tmp/tinyodom_stm32_toy_generate") EXPECTED_OUTPUTS = [ "network.c", diff --git a/analysis_scripts/stm32_example_project/run_stm32_cadenced_comparison.py b/analysis_scripts/stm32_example_project/run_stm32_cadenced_comparison.py index 303d1fd..032b362 100644 --- a/analysis_scripts/stm32_example_project/run_stm32_cadenced_comparison.py +++ b/analysis_scripts/stm32_example_project/run_stm32_cadenced_comparison.py @@ -335,10 +335,10 @@ def _build_arg_parser() -> argparse.ArgumentParser: parser.add_argument( "--config", type=Path, - default=REPO_ROOT / "src" / "config" / "nas_config.yaml", + default=REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml", help=( "TinyODOM NAS configuration YAML used for staging the perturbed model. " - "Example: src/config/nas_config.yaml" + "Example: src/config/nas_config_stm32.yaml" ), ) parser.add_argument( diff --git a/analysis_scripts/stm32_example_project/run_stm32_cpu_clock_sweep.py b/analysis_scripts/stm32_example_project/run_stm32_cpu_clock_sweep.py index 059d680..7564da7 100644 --- a/analysis_scripts/stm32_example_project/run_stm32_cpu_clock_sweep.py +++ b/analysis_scripts/stm32_example_project/run_stm32_cpu_clock_sweep.py @@ -21,7 +21,7 @@ REPO_ROOT = SCRIPT_DIR.parents[1] RUNNER_PATH = SCRIPT_DIR / "run_stm32_toy_ai_hil.py" DEFAULT_PROJECT_ROOT = SCRIPT_DIR / "stm32_cadenced_toy_ai_project" / "FSBL" -DEFAULT_CONFIG = REPO_ROOT / "src" / "config" / "nas_config.yaml" +DEFAULT_CONFIG = REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml" DEFAULT_RESULTS_ROOT = SCRIPT_DIR / "results" DEFAULT_STAGE_OUTPUT_ROOT = Path("/tmp/tinyodom_stm32_toy_generate") DEFAULT_WEIGHTS_MEMORY_POOL = SCRIPT_DIR / "nucleo_mypool.json" diff --git a/analysis_scripts/stm32_example_project/run_stm32_lrun_cadenced_comparison.py b/analysis_scripts/stm32_example_project/run_stm32_lrun_cadenced_comparison.py index 3cccecc..c2b4dbd 100644 --- 
a/analysis_scripts/stm32_example_project/run_stm32_lrun_cadenced_comparison.py +++ b/analysis_scripts/stm32_example_project/run_stm32_lrun_cadenced_comparison.py @@ -57,7 +57,7 @@ def _run_case( def main() -> int: parser = argparse.ArgumentParser(description="Compare LRUN back-to-back and cadenced runs.") parser.add_argument("--project-root", type=Path, default=SCRIPT_DIR / "stm32_lrun_toy_ai_project") - parser.add_argument("--config", type=Path, default=SCRIPT_DIR.parents[1] / "src" / "config" / "nas_config.yaml") + parser.add_argument("--config", type=Path, default=SCRIPT_DIR.parents[1] / "src" / "config" / "nas_config_stm32.yaml") parser.add_argument("--output-dir", type=Path, default=SCRIPT_DIR / "out" / "stm32_lrun_cadenced_comparison") parser.add_argument("--attempts", type=int, default=3) parser.add_argument("--weight-storage-mode", choices=["embedded", "external_flash"], default="external_flash") diff --git a/analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py b/analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py index a91a191..40acd3f 100644 --- a/analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py +++ b/analysis_scripts/stm32_example_project/run_stm32_toy_ai_hil.py @@ -37,7 +37,7 @@ DEFAULT_BAUD = 115200 DEFAULT_LATENCY_BUDGET_MS = 200.0 DEFAULT_OUTPUT = SCRIPT_DIR / "stm32_toy_ai_metrics.json" -DEFAULT_CONFIG = REPO_ROOT / "src" / "config" / "nas_config.yaml" +DEFAULT_CONFIG = REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml" DEFAULT_STAGE_OUTPUT_ROOT = Path("/tmp/tinyodom_stm32_toy_generate") DEFAULT_HARNESS_FQBN = "arduino:mbed_nano:nano33ble" DEFAULT_HARNESS_AUTO_FLASH = "once" @@ -261,7 +261,7 @@ def _build_parser() -> argparse.ArgumentParser: help=( "TinyODOM NAS configuration YAML used to generate and export the perturbed model. " f"Default: {DEFAULT_CONFIG}. 
" - "Examples: src/config/nas_config.yaml, configs/my_experiment.yaml" + "Examples: src/config/nas_config_stm32.yaml, configs/my_experiment.yaml" ), ) parser.add_argument( diff --git a/analysis_scripts/stm32_example_project/smoke_test_stm32_lrun_toy_ai.py b/analysis_scripts/stm32_example_project/smoke_test_stm32_lrun_toy_ai.py index dbf83bb..85cad50 100644 --- a/analysis_scripts/stm32_example_project/smoke_test_stm32_lrun_toy_ai.py +++ b/analysis_scripts/stm32_example_project/smoke_test_stm32_lrun_toy_ai.py @@ -19,7 +19,7 @@ def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Smoke-test the LRUN STM32 toy AI path.") parser.add_argument("--project-root", type=Path, default=SCRIPT_DIR / "stm32_lrun_toy_ai_project") - parser.add_argument("--config", type=Path, default=SCRIPT_DIR.parents[1] / "src" / "config" / "nas_config.yaml") + parser.add_argument("--config", type=Path, default=SCRIPT_DIR.parents[1] / "src" / "config" / "nas_config_stm32.yaml") parser.add_argument("--output-dir", type=Path, default=SCRIPT_DIR / "out" / "stm32_lrun_smoke") parser.add_argument("--stage-output-root", type=Path, default=Path("/tmp/tinyodom_stm32_lrun_smoke")) parser.add_argument("--phase", choices=["back_to_back", "cadenced"], default="cadenced") diff --git a/analysis_scripts/stm32_example_project/stm32_lrun_common.py b/analysis_scripts/stm32_example_project/stm32_lrun_common.py index d3c9b13..98432a0 100644 --- a/analysis_scripts/stm32_example_project/stm32_lrun_common.py +++ b/analysis_scripts/stm32_example_project/stm32_lrun_common.py @@ -22,7 +22,7 @@ REPO_ROOT = SCRIPT_DIR.parents[1] DEFAULT_PROJECT_ROOT = SCRIPT_DIR / "stm32_lrun_toy_ai_project" -DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" / "nas_config.yaml" +DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml" DEFAULT_OUTPUT_ROOT = Path("/tmp/tinyodom_stm32_lrun_generate") DEFAULT_OUTPUT_JSON = SCRIPT_DIR / "stm32_lrun_toy_ai_metrics.json" DEFAULT_DUT_PORT 
= "/dev/ttyACM0" diff --git a/analysis_scripts/stm32_example_project/stm32_phase2_candidate.py b/analysis_scripts/stm32_example_project/stm32_phase2_candidate.py index b3ed4d6..96b41df 100644 --- a/analysis_scripts/stm32_example_project/stm32_phase2_candidate.py +++ b/analysis_scripts/stm32_example_project/stm32_phase2_candidate.py @@ -19,7 +19,7 @@ from tinyodom.model import configured_quantization_mode, load_config, quantization_requires_calibration from tinyodom.runtime_bootstrap import bootstrap_pipeline -DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" / "nas_config.yaml" +DEFAULT_CONFIG_PATH = REPO_ROOT / "src" / "config" / "nas_config_stm32.yaml" PERTURBED_VARIANT_NAME = "approx_trained" diff --git a/src/README.md b/src/README.md index 5e0ad5f..323242a 100644 --- a/src/README.md +++ b/src/README.md @@ -22,10 +22,6 @@ Related docs: - [`hil_server.py`](hil_server.py) Runs the ZeroMQ HIL server that materializes models, stages device-specific candidates, and returns compile/runtime metrics. -- [`config/nas_config.yaml`](config/nas_config.yaml) - Default runtime configuration for device selection, dataset parameters, - training controls, NAS scoring/pruning, outputs, logging, and network - settings. ## Package Map @@ -98,7 +94,7 @@ typed payloads shared across orchestration code. At a high level, the source tree is wired like this: 1. `nas_model_client.py` or `hil_server.py` loads - [`config/nas_config.yaml`](config/nas_config.yaml) through shared helpers in + [`config/nas_config_stm32.yaml`](config/nas_config_stm32.yaml) through shared helpers in [`tinyodom/model.py`](tinyodom/model.py). 2. The entry point calls `ensure_builtin_components_registered()` from @@ -107,8 +103,8 @@ At a high level, the source tree is wired like this: 3. 
The entry point runs the shared bootstrap in [`tinyodom/runtime_bootstrap.py`](tinyodom/runtime_bootstrap.py), which resolves component selection, instantiates the selected dataset/task/model - family, derives the target spec, and validates `nas.score` / `nas.prune` - against the task metric contract. + family, derives the target spec, and validates `nas.score`, `nas.prune`, + and `nas.feasibility` against the task metric contract. 4. The selected dataset adapter loads data and produces a normalized `DatasetBundle`. 5. The selected task adapter builds the target contract and training/evaluation @@ -241,6 +237,7 @@ Stable shared columns include: error codes - score/objective metadata (`score_type`, `objective_*_json`) - pruning metadata +- feasibility metadata and signed Optuna constraints - `artifact_summary_json` - cadenced runtime telemetry fields when present @@ -359,8 +356,8 @@ Important caveat: For the current scoring, pruning, and runtime knobs, use: - [`config/README.md`](config/README.md) for the config reference -- [`config/nas_config.yaml`](config/nas_config.yaml) for the default config shape -- [`tinyodom/model.py`](tinyodom/model.py) for score/prune evaluation and HIL - request construction +- [`config/nas_config_stm32.yaml`](config/nas_config_stm32.yaml) for the STM32 config shape +- [`tinyodom/model.py`](tinyodom/model.py) for score/prune/feasibility + evaluation and HIL request construction - [`hil_server.py`](hil_server.py) for the HIL-side request handling and backend failure shaping diff --git a/src/config/README.md b/src/config/README.md index c6d6618..f7bd36a 100644 --- a/src/config/README.md +++ b/src/config/README.md @@ -5,32 +5,35 @@ This directory documents the runtime configuration surface for TinyODOM-EX. For the source architecture and extension map, see [`../README.md`](../README.md). -The current shipped config examples are: +Case-study run configs for the paper live under +[`case_study_configs/`](case_study_configs/). 
-- [`nas_config.yaml`](nas_config.yaml) +## Example Configs + +- [`nas_config_stm32.yaml`](nas_config_stm32.yaml) + STM32 N657 OxIOD measured-energy NAS example and the most complete commented + reference for score/prune policy shapes. - [`nas_config_ble.yaml`](nas_config_ble.yaml) + Arduino Nano 33 BLE Sense OxIOD measured-energy NAS example. - [`nas_config_portenta.yaml`](nas_config_portenta.yaml) + Portenta H7 CM7 OxIOD measured-energy NAS example. - [`nas_config_audio_stm32.yaml`](nas_config_audio_stm32.yaml) + STM32 N657 UrbanSound8K / DS-CNN audio NAS example. - [`nas_config_audio_portenta.yaml`](nas_config_audio_portenta.yaml) + Portenta H7 CM7 UrbanSound8K / DS-CNN audio NAS example. - [`nas_config_flops_rmse.yaml`](nas_config_flops_rmse.yaml) - -Use [`nas_config.yaml`](nas_config.yaml) as the default starting point for the -repo. It is the main STM32-oriented example config and the most complete -reference for the current score/prune surface. Use the BLE and Portenta files -when you want board-specific starting points for those Arduino-backed targets. -Use [`nas_config_audio_stm32.yaml`](nas_config_audio_stm32.yaml) for the -desktop-first UrbanSound8K / DS-CNN audio path before moving into STM32 HIL -work. Use [`nas_config_audio_portenta.yaml`](nas_config_audio_portenta.yaml) -for the Phase 8 Arduino audio path on Portenta H7 CM7. -Use [`nas_config_flops_rmse.yaml`](nas_config_flops_rmse.yaml) for a pure -desktop OxIOD NAS run that optimizes validation RMSE against FLOPs without HIL -or compile-only resource proxy metrics. + Pure desktop OxIOD NAS example that optimizes validation RMSE and FLOPs. +- [`nas_config_memory_proxy.yaml`](nas_config_memory_proxy.yaml) + Pure desktop OxIOD NAS example that optimizes validation RMSE and static + memory traffic. Audio analysis runners live under [`../../analysis_scripts`](../../analysis_scripts). 
They measure classifier inference over precomputed log-mel feature tensors; they do not include firmware-side microphone capture or audio feature extraction. -Phase 9 adds optional audio final fold-rotation reporting through: +## Audio Fold Rotation + +Audio configs can optionally run final reporting across UrbanSound8K folds: - `task.params.evaluation.protocol: fixed_split | fold_rotation` - `task.params.evaluation.fold_rotation.test_folds`, defaulting to all 10 @@ -51,9 +54,9 @@ The runtime loader and validator live in that resolves components and validates NAS policy against the active task lives in [`../tinyodom/runtime_bootstrap.py`](../tinyodom/runtime_bootstrap.py). -## Current Shape +## Top-Level Blocks -The main top-level blocks in the current config surface are: +The main top-level blocks are: - `device` Hardware target, HIL runtime behavior, timing, harness options, and @@ -80,8 +83,8 @@ The main top-level blocks in the current config surface are: Those component blocks are resolved by [`../tinyodom/component_selection.py`](../tinyodom/component_selection.py), and -they are now mandatory. The older top-level `data` block is no longer part of -the supported config contract. +they are mandatory. Use `dataset`, `task`, and `model`; the old top-level +`data` block is not supported. ## `device` @@ -104,10 +107,9 @@ Common keys: `back_to_back` or `cadenced`. - `device.latency_budget_ms` Optional shared cadence-budget override. When omitted, the runtime derives - it from the active dataset cadence: first `dataset.params.batch_period_ms` - when present, then legacy - `dataset.params.stride / dataset.params.sampling_rate_hz * 1000` for the - built-in `oxiod` dataset. + it from the active dataset cadence: first `dataset.params.batch_period_ms`, + then `dataset.params.stride / dataset.params.sampling_rate_hz * 1000` for + the built-in `oxiod` dataset. - `device.serial_port` DUT serial port. 
- `device.measured_inference_runs` @@ -132,14 +134,14 @@ Harness-related keys: - `device.harness_active_timeout_s` - `device.harness_done_timeout_s` -Per-backend nested blocks currently include: +Per-backend nested blocks include: - `device.portenta.*` - `device.stm32.*` -The current STM32 option plumbing is resolved by +STM32 option plumbing is resolved by [`../tinyodom/microcontrollers/__init__.py`](../tinyodom/microcontrollers/__init__.py). -Examples of STM32-owned keys currently supported in code include: +Examples of STM32-owned keys include: - `template_root` - `project_root` @@ -161,24 +163,24 @@ Examples of STM32-owned keys currently supported in code include: - `signing_header_version` - `max_external_flash_bytes` -Important current caveats: +Validation notes: - `device.runtime_mode` must be `back_to_back` or `cadenced`. - `device.stm32.runtime_mode` is no longer supported. Use `device.runtime_mode` instead. - `device.stm32.project_layout` is no longer supported. LRUN `dev_boot` is - implicit for the current STM32 backend. + implicit for the STM32 backend. - `device.latency_budget_ms` must be positive when set. - `device.measured_inference_runs` must be an integer `>= 1`. - `device.cpu_clock_mhz_options` must be a non-empty integer list when set. - For `STM32_NUCLEO_N657X0_Q`, `device.cpu_clock_mhz_options` is validated against the backend-supported set in code. -- For `PORTENTA_H7` and `ARDUINO_NANO_33_BLE_SENSE`, cadenced mode currently - requires `training.input_mode: uniform`. +- For `PORTENTA_H7` and `ARDUINO_NANO_33_BLE_SENSE`, cadenced mode requires + `training.input_mode: uniform`. ## `training` -The `training` block still owns the main NAS/training runtime switches. +The `training` block owns the main NAS/training runtime switches. 
Common keys in the shipped config: @@ -193,13 +195,13 @@ Common keys in the shipped config: - `training.energy_aware` - `training.input_mode` -Current runtime behavior: +Runtime behavior: - `training.energy_aware` defaults to `false` when omitted - `training.quantization` is required and must use the mapping shape: - `mode`, `search`, and non-empty `choices`. Most shipped configs fix - `mode: int8_ptq`, `search: false`, and `choices: [int8_ptq]`; the audio - STM32 config intentionally searches `choices: [float, int8_ptq]`. + `mode`, `search`, and non-empty `choices`. The main measured-board configs + search `choices: [float, int8_ptq]` so float32 and int8 PTQ exports can be + compared on the same backend. - Supported v1 quantization modes are `float` and `int8_ptq`. Enabling `training.quantization.search: true` samples `quantization_mode` from `choices`; this expands the effective NAS search space and usually needs a @@ -216,21 +218,25 @@ Current runtime behavior: scale/zero-point values and TFLite signature output order; final fixed-split reporting exports/evaluates the trained TFLite on the test split after `train_best_trial`. -- Closeout artifacts may still be Keras-derived in this phase unless a specific - path explicitly requests TFLite evaluation. +- Some closeout artifact paths are Keras-derived unless the path explicitly + requests TFLite evaluation. - `training.input_mode` defaults to `uniform` when omitted - `training.input_mode` supports dataset-agnostic `uniform` plus dataset-specific analysis modes: `oxiod_representative`, `oxiod_real`, `urbansound8k_representative`, and `urbansound8k_real` - `training.max_total_trials` defaults to `training.nas_trials * 2` when omitted +- `training.nas_trials` is the target number of feasible completed trials when + `nas.feasibility.rules` is enabled. 
Infeasible, failed, and pruned attempts + still count against `training.max_total_trials`, so constrained hardware + runs usually need a larger total-attempt budget than the feasible target. ## `dataset`, `task`, and `model` The modular component-selection surface is resolved by [`../tinyodom/component_selection.py`](../tinyodom/component_selection.py). -Current keys: +Keys: - `dataset.name` - `dataset.params` @@ -244,7 +250,7 @@ For `audio_dscnn`, `model.search: {}` means "use the model-family default search surface" from `AudioDSCNNFamily.AUDIO_DSCNN_SEARCH_CHOICES`. Add keys under `model.search` only when you want to narrow that default surface. -Current caveats: +Validation notes: - `dataset`, `task`, and `model` are required top-level blocks - `dataset.params` is required for the built-in `oxiod` dataset path @@ -252,8 +258,8 @@ Current caveats: - model family classes are instantiated as zero-argument classes - task classes are expected to use the explicit keyword-only constructor contract `__init__(*, checkpoint_path, early_stopping_patience)`; the runtime - no longer probes constructor signatures and does not provide backward- - compatibility shims for older task classes + does not probe constructor signatures or provide compatibility shims for + older task classes Minimal example: @@ -281,9 +287,9 @@ model: ## `nas` -The `nas` block owns scoring and pruning policy. +The `nas` block owns scoring, pruning, and constrained-feasibility policy. -Current structure: +Structure: - `nas.score.type` `scoring-function` or `multi-objective` @@ -292,26 +298,51 @@ Current structure: - `nas.score.params` Score terms or objectives depending on `score.type` - `nas.prune.rules` - Optional pre-training hard-reject rules + Optional post-build/pre-fit hard termination gates +- `nas.feasibility.train_if_infeasible` + Whether trials that violate feasibility constraints should still train and + return real objective values. Defaults to `false`. 
+- `nas.feasibility.rules` + Optional post-build/pre-fit deployability constraints using the same rule + shape as `nas.prune.rules`: `rule`, `metric`, `condition`, `reference`, and + `reason`. -Built-in derived metric types currently documented by the shipped config and -validated in code include: +Built-in derived metric types validated in code include: - `add` - `energy-budget-from-power` -Current scalar term types include: +Scalar term types include: - `weighted` - `normalized-weighted` - `boundary` - `target` -Current practical guidance: +Practical guidance: -- use `scoring-function` when you want one scalar score and config-driven - prune rules +- use `scoring-function` when you want one scalar score - use `multi-objective` when you want a Pareto front instead of one scalar +- use `nas.prune.rules` with either score type when a pre-fit metric should + hard-stop a trial before feasibility is evaluated. Rules run after model + build/compile, FLOP counting, + and HIL/compile metric collection, but before `task.build_fit_plan`, + `model.fit`, TFLite validation, or Keras validation. Multi-objective gate + hits are logged with `pruned=True` but remain Optuna COMPLETE trials with + direction-aware penalty values. +- use `nas.feasibility.rules` for deployability constraints that should be + visible to Optuna constrained samplers. Feasibility is evaluated after hard + failures and `nas.prune.rules`; each rule persists one signed constraint + where `<= 0` is feasible and `> 0` is infeasible. With + `train_if_infeasible: false`, infeasible trials skip training and return + penalties while still consuming `training.max_total_trials`. With + `train_if_infeasible: true`, trials train normally but remain + constrained-infeasible for samplers, CSV filtering, and plots. +- move latency/deadline budget gates such as `latency_ms > latency_budget_ms` + or `cadenced_deadline_miss_count > 0` to `nas.feasibility.rules` when you + want constrained dominance. 
Keep the same metric in `nas.prune.rules` only + when you deliberately want hard early termination for debugging or resource + protection. - keep non-HIL configs away from score/prune terms that require measured latency or energy. `latency_ms`, energy/power/current/voltage metrics, `clock_hz`, `harness_latency_ms`, and `cadenced_*` metrics require @@ -319,24 +350,25 @@ Current practical guidance: - set `device.compile_when_hil_disabled: false` for pure desktop scores such as RMSE/FLOPs; leave it as `auto` when non-HIL score/prune terms still need compile-derived resource metrics. -- in cadenced multi-objective runs, overload remains telemetry rather than an - automatic prune +- in cadenced multi-objective runs, overload is telemetry unless you add a + `nas.feasibility.rules` constraint such as + `cadenced_deadline_miss_count > 0` -The most readable examples remain in -[`nas_config.yaml`](nas_config.yaml) itself. +The most complete commented examples remain in +[`nas_config_stm32.yaml`](nas_config_stm32.yaml) itself. ## `outputs` The `outputs` block controls directory roots and naming inputs. -Current shipped keys: +Keys: - `outputs.models_dir` - `outputs.candidate_dir` - `outputs.artifact_stem` - `outputs.log_file_name` -Important runtime caveat: +Runtime notes: - `load_config(...)` derives read-only runtime fields `model_name` and `checkpoint_name` from `outputs.artifact_stem` and `device.name`, then @@ -352,7 +384,7 @@ So the final in-memory values may differ from the literal YAML text. The `network` block owns HIL socket settings. -Current shipped keys: +Keys: - `network.host` - `network.port` @@ -363,7 +395,7 @@ These must match the HIL client/server deployment you actually run. 
## `logging` -The `logging` block currently exposes: +The `logging` block exposes: - `logging.level` @@ -380,7 +412,7 @@ Valid values are: Use these files together: -- [`nas_config.yaml`](nas_config.yaml) for the main commented example +- [`nas_config_stm32.yaml`](nas_config_stm32.yaml) for the STM32 commented example - [`../tinyodom/model.py`](../tinyodom/model.py) for validation and derived runtime behavior - [`../README.md`](../README.md) for source architecture diff --git a/src/config/case_study_configs/README.md b/src/config/case_study_configs/README.md new file mode 100644 index 0000000..d10459f --- /dev/null +++ b/src/config/case_study_configs/README.md @@ -0,0 +1,17 @@ +# Case Study Configs + +These configs are the paper-oriented run setups for the CREST case studies. +Use the top-level configs in `src/config/` as general examples; use this folder +when you want the specific study axes from the paper. + +- **Case Study 1:** OxIOD/TCN measured-energy NAS across BLE33, Portenta M7, + Portenta M4, and STM32. The FLOPs and memory-proxy examples live one + directory up as `nas_config_flops_rmse.yaml` and + `nas_config_memory_proxy.yaml`. +- **Case Study 2:** STM32/OxIOD schedule comparison. The back-to-back config + is the continuous-inference side; the cadenced config is the sensing-window + side. +- **Case Study 3:** UrbanSound8K/DS-CNN application-level scoring on Portenta + M7 and STM32. + +Each YAML includes its intended launch command near the top. diff --git a/src/config/case_study_configs/nas_config_case1_2_ble33_b2b_oxiod.yaml b/src/config/case_study_configs/nas_config_case1_2_ble33_b2b_oxiod.yaml new file mode 100644 index 0000000..f8b72df --- /dev/null +++ b/src/config/case_study_configs/nas_config_case1_2_ble33_b2b_oxiod.yaml @@ -0,0 +1,86 @@ +# Case 1.2: Arduino Nano 33 BLE Sense OxIOD measured-energy NAS, back-to-back runtime. 
+# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case1_2_ble33_b2b_oxiod.yaml \ +# --study-name OxIOD_BLE33_B2B_case1_2 + +device: + name: ARDUINO_NANO_33_BLE_SENSE + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 150 + nas_multiobjective_population_size: 50 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: true + choices: [float, int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: multi-objective + params: + objectives: + - metric: rmse_total + direction: minimize + - metric: energy_mj_per_inference + direction: minimize + prune: + rules: [] + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "OxIOD_BLE33_B2B_case1_2" + log_file_name: "log_NAS_OxIOD_BLE33_B2B_case1_2.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/case_study_configs/nas_config_case1_3_portenta_m7_b2b_oxiod.yaml b/src/config/case_study_configs/nas_config_case1_3_portenta_m7_b2b_oxiod.yaml new file mode 
100644 index 0000000..c3ff71c --- /dev/null +++ b/src/config/case_study_configs/nas_config_case1_3_portenta_m7_b2b_oxiod.yaml @@ -0,0 +1,90 @@ +# Case 1.3: Portenta H7 CM7 OxIOD measured-energy NAS, back-to-back runtime. +# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case1_3_portenta_m7_b2b_oxiod.yaml \ +# --study-name OxIOD_PORTENTA_M7_B2B_case1_3 + +device: + name: PORTENTA_H7 + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + portenta: + target_core: "cm7" + split: "75_25" + security: "none" + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 150 + nas_multiobjective_population_size: 50 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: true + choices: [float, int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: multi-objective + params: + objectives: + - metric: rmse_total + direction: minimize + - metric: energy_mj_per_inference + direction: minimize + prune: + rules: [] + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "OxIOD_PORTENTA_M7_B2B_case1_3" + log_file_name: "log_NAS_OxIOD_PORTENTA_M7_B2B_case1_3.csv" + 
+network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/case_study_configs/nas_config_case1_4_portenta_m4_b2b_oxiod.yaml b/src/config/case_study_configs/nas_config_case1_4_portenta_m4_b2b_oxiod.yaml new file mode 100644 index 0000000..97ea287 --- /dev/null +++ b/src/config/case_study_configs/nas_config_case1_4_portenta_m4_b2b_oxiod.yaml @@ -0,0 +1,90 @@ +# Case 1.4: Portenta H7 CM4 OxIOD measured-energy NAS, back-to-back runtime. +# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case1_4_portenta_m4_b2b_oxiod.yaml \ +# --study-name OxIOD_PORTENTA_M4_B2B_case1_4 + +device: + name: PORTENTA_H7 + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + portenta: + target_core: "cm4" + split: "50_50" + security: "none" + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 150 + nas_multiobjective_population_size: 50 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: true + choices: [float, int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: multi-objective + params: + objectives: + 
- metric: rmse_total + direction: minimize + - metric: energy_mj_per_inference + direction: minimize + prune: + rules: [] + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "OxIOD_PORTENTA_M4_B2B_case1_4" + log_file_name: "log_NAS_OxIOD_PORTENTA_M4_B2B_case1_4.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/nas_config_case1_5_stm32_b2b_oxiod.yaml b/src/config/case_study_configs/nas_config_case1_5_stm32_b2b_oxiod.yaml similarity index 93% rename from src/config/nas_config_case1_5_stm32_b2b_oxiod.yaml rename to src/config/case_study_configs/nas_config_case1_5_stm32_b2b_oxiod.yaml index eb3b5dd..22e7b54 100644 --- a/src/config/nas_config_case1_5_stm32_b2b_oxiod.yaml +++ b/src/config/case_study_configs/nas_config_case1_5_stm32_b2b_oxiod.yaml @@ -3,8 +3,8 @@ # Intended launch on the GPU host: # CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ # python src/nas_model_client.py \ -# --config src/config/nas_config_case1_5_stm32_b2b_oxiod.yaml \ -# --study-name OxIOD_STM32_B2B_case1_5_t1 +# --config src/config/case_study_configs/nas_config_case1_5_stm32_b2b_oxiod.yaml \ +# --study-name OxIOD_STM32_B2B_case1_5 device: name: STM32_NUCLEO_N657X0_Q @@ -54,7 +54,7 @@ model: training: nas_epochs: 55 model_epochs: 990 - nas_trials: 150 + nas_trials: 250 nas_multiobjective_population_size: 50 max_total_trials: 300 quantization: diff --git a/src/config/case_study_configs/nas_config_case2_1_stm32_b2b_oxiod.yaml b/src/config/case_study_configs/nas_config_case2_1_stm32_b2b_oxiod.yaml new file mode 100644 index 0000000..e0e8b57 --- /dev/null +++ b/src/config/case_study_configs/nas_config_case2_1_stm32_b2b_oxiod.yaml @@ -0,0 +1,105 @@ +# Case 2.1: STM32 N657 OxIOD measured-energy NAS, back-to-back runtime, +# with post-HIL latency feasibility constraints. 
+# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case2_1_stm32_b2b_oxiod.yaml \ +# --study-name OxIOD_STM32_B2B_case2_1 + +device: + name: STM32_NUCLEO_N657X0_Q + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + cpu_clock_mhz_options: [200, 300, 400, 600, 800] + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + stm32: + project_root: "sketches/stm32/tinyodom_stm32_lrun" + appli_flash_address: "0x70100000" + wake_margin_us: 5000 + min_sleep_us: 5000 + weight_storage_mode: "external_flash" + weights_memory_pool: "analysis_scripts/stm32_example_project/nucleo_mypool.json" + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + # Feasibility-enabled NAS targets this many feasible completed trials. + nas_trials: 256 + nas_multiobjective_population_size: 64 + # Infeasible, pruned, and failed attempts consume this larger cap. 
+ max_total_trials: 600 + quantization: + mode: int8_ptq + search: true + choices: [float, int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: multi-objective + params: + objectives: + - metric: rmse_total + direction: minimize + - metric: energy_mj_per_inference + direction: minimize + feasibility: + train_if_infeasible: false + rules: + - rule: latency_budget + metric: latency_ms + condition: gt + reference: + type: metric + metric: latency_budget_ms + reason: "Latency exceeds deployment budget" + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "OxIOD_STM32_B2B_case2_1" + log_file_name: "log_NAS_OxIOD_STM32_B2B_case2_1.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/nas_config_case2_2_stm32_cadenced_oxiod.yaml b/src/config/case_study_configs/nas_config_case2_2_stm32_cadenced_oxiod.yaml similarity index 74% rename from src/config/nas_config_case2_2_stm32_cadenced_oxiod.yaml rename to src/config/case_study_configs/nas_config_case2_2_stm32_cadenced_oxiod.yaml index f8d2c88..e9748be 100644 --- a/src/config/nas_config_case2_2_stm32_cadenced_oxiod.yaml +++ b/src/config/case_study_configs/nas_config_case2_2_stm32_cadenced_oxiod.yaml @@ -1,10 +1,11 @@ -# Case 2.2: STM32 N657 OxIOD measured-energy NAS, cadenced runtime. +# Case 2.2: STM32 N657 OxIOD measured-energy NAS, cadenced runtime, +# with post-HIL cadenced feasibility constraints. 
# # Intended launch on the GPU host: # CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ # python src/nas_model_client.py \ -# --config src/config/nas_config_case2_2_stm32_cadenced_oxiod.yaml \ -# --study-name OxIOD_STM32_CADENCED_case2_2_t1 +# --config src/config/case_study_configs/nas_config_case2_2_stm32_cadenced_oxiod.yaml \ +# --study-name OxIOD_STM32_CADENCED_case2_2 device: name: STM32_NUCLEO_N657X0_Q @@ -54,9 +55,11 @@ model: training: nas_epochs: 55 model_epochs: 990 - nas_trials: 150 - nas_multiobjective_population_size: 50 - max_total_trials: 300 + # Feasibility-enabled NAS targets this many feasible completed trials. + nas_trials: 256 + nas_multiobjective_population_size: 64 + # Infeasible, pruned, and failed attempts consume this larger cap. + max_total_trials: 600 quantization: mode: int8_ptq search: true @@ -75,8 +78,16 @@ nas: direction: minimize - metric: cadenced_energy_mj_per_window direction: minimize - prune: - rules: [] + feasibility: + train_if_infeasible: false + rules: + - rule: latency_budget + metric: latency_ms + condition: gt + reference: + type: metric + metric: latency_budget_ms + reason: "Latency exceeds deployment budget" outputs: models_dir: "models" diff --git a/src/config/case_study_configs/nas_config_case3_1_portenta_m7_audio.yaml b/src/config/case_study_configs/nas_config_case3_1_portenta_m7_audio.yaml new file mode 100644 index 0000000..6126905 --- /dev/null +++ b/src/config/case_study_configs/nas_config_case3_1_portenta_m7_audio.yaml @@ -0,0 +1,143 @@ +# Case 3.1: Portenta H7 CM7 UrbanSound8K DS-CNN scalar-score NAS. +# +# This is the Portenta M7 counterpart to the STM32 Case 3.2 scalar-score run. +# It measures classifier inference over cached log-mel tensors only. It does +# not include microphone capture, feature extraction, logging, or communications +# energy in the NAS score. 
+# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case3_1_portenta_m7_audio.yaml \ +# --study-name UrbanSound8K_DSCNN_PORTENTA_M7_AUDIO_case3_1 + +device: + name: PORTENTA_H7 + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + latency_budget_ms: 2000.0 + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + portenta: + target_core: "cm7" + split: "75_25" + security: "none" + +dataset: + name: urbansound8k_mel + params: + cache_dir: "data/urbansound8k/cache/v2_logmel_16k_2s_64mels_25ms_10ms_folds" + fold_rotation_cache_dir: "data/urbansound8k/cache/v2_logmel_16k_2s_64mels_25ms_10ms_folds/fold_rotation" + batch_period_ms: 2000 + +task: + name: sound_classification + params: + early_stopping_patience: 20 + evaluation: + protocol: fixed_split + fold_rotation: + test_folds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +model: + family: audio_dscnn + params: + export_variant: untrained + search: {} # Empty search uses AudioDSCNNFamily.AUDIO_DSCNN_SEARCH_CHOICES. 
+ +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 200 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: false + choices: [int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: scoring-function + metrics: + system_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 800.0 + duration_ms: + type: metric + metric: latency_budget_ms + inference_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 200.0 + duration_ms: + type: metric + metric: latency_budget_ms + params: + terms: + - type: weighted + metric: macro_f1 + weight: 1.0 + - type: normalized-weighted + metric: energy_mj_per_inference + weight: -0.10 + reference: + type: metric + metric: inference_energy_budget_mj + prune: + rules: [] + feasibility: + train_if_infeasible: false + rules: + - rule: latency_budget + metric: latency_ms + condition: gt + reference: + type: metric + metric: latency_budget_ms + reason: "Latency exceeds the 2 s audio decision cadence" + - rule: ram_capacity + metric: ram_bytes + condition: gt + reference: + type: metric + metric: max_ram_bytes + reason: "RAM usage exceeds target capacity" + - rule: flash_capacity + metric: flash_bytes + condition: gt + reference: + type: metric + metric: max_flash_bytes + reason: "Flash usage exceeds target capacity" + +outputs: + models_dir: "models" + candidate_dir: "audio_dscnn" + artifact_stem: "UrbanSound8K_DSCNN_PORTENTA_M7_AUDIO_case3_1" + log_file_name: "log_NAS_UrbanSound8K_DSCNN_PORTENTA_M7_AUDIO_case3_1.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/case_study_configs/nas_config_case3_2_stm32_audio.yaml b/src/config/case_study_configs/nas_config_case3_2_stm32_audio.yaml new file mode 100644 index 0000000..52e66a7 --- /dev/null +++ 
b/src/config/case_study_configs/nas_config_case3_2_stm32_audio.yaml @@ -0,0 +1,154 @@ +# Case 3.2: STM32 N657 UrbanSound8K DS-CNN scalar-score NAS +# with cadenced telemetry collection. +# +# This measures classifier inference over cached log-mel tensors only. It does +# not include microphone capture, feature extraction, logging, or communications +# energy in the NAS score. +# +# Intended launch on the GPU host: +# CUDA_VISIBLE_DEVICES=1 TF_FORCE_GPU_ALLOW_GROWTH=true \ +# python src/nas_model_client.py \ +# --config src/config/case_study_configs/nas_config_case3_2_stm32_audio.yaml \ +# --study-name UrbanSound8K_DSCNN_STM32_AUDIO_case3_2 + +device: + name: STM32_NUCLEO_N657X0_Q + hil: true + compile_when_hil_disabled: auto + runtime_mode: cadenced + latency_budget_ms: 2000.0 + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + cpu_clock_mhz_options: null + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + stm32: + project_root: "sketches/stm32/tinyodom_stm32_lrun" + appli_flash_address: "0x70100000" + wake_margin_us: 5000 + min_sleep_us: 5000 + weight_storage_mode: "external_flash" + weights_memory_pool: "analysis_scripts/stm32_example_project/nucleo_mypool.json" + +dataset: + name: urbansound8k_mel + params: + cache_dir: "data/urbansound8k/cache/v2_logmel_16k_2s_64mels_25ms_10ms_folds" + fold_rotation_cache_dir: "data/urbansound8k/cache/v2_logmel_16k_2s_64mels_25ms_10ms_folds/fold_rotation" + batch_period_ms: 2000 + +task: + name: sound_classification + params: + early_stopping_patience: 20 + evaluation: + protocol: fixed_split + fold_rotation: + test_folds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +model: + family: audio_dscnn + params: + export_variant: untrained + search: {} # Empty search uses 
AudioDSCNNFamily.AUDIO_DSCNN_SEARCH_CHOICES. + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 200 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: false + choices: [int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: scoring-function + metrics: + system_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 800.0 + duration_ms: + type: metric + metric: latency_budget_ms + inference_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 200.0 + duration_ms: + type: metric + metric: latency_budget_ms + params: + terms: + - type: weighted + metric: macro_f1 + weight: 1.0 + - type: normalized-weighted + metric: energy_mj_per_inference + weight: -0.10 + reference: + type: metric + metric: inference_energy_budget_mj + prune: + rules: [] + feasibility: + train_if_infeasible: false + rules: + - rule: latency_budget + metric: latency_ms + condition: gt + reference: + type: metric + metric: latency_budget_ms + reason: "Latency exceeds the 2 s audio decision cadence" + - rule: ram_capacity + metric: ram_bytes + condition: gt + reference: + type: metric + metric: max_ram_bytes + reason: "RAM usage exceeds target capacity" + - rule: flash_capacity + metric: flash_bytes + condition: gt + reference: + type: metric + metric: max_flash_bytes + reason: "Internal flash usage exceeds target capacity" + - rule: external_flash_capacity + metric: external_flash_bytes + condition: gt + reference: + type: literal + value: 67108864 + reason: "External flash weight storage exceeds target capacity" + +outputs: + models_dir: "models" + candidate_dir: "audio_dscnn" + artifact_stem: "UrbanSound8K_DSCNN_STM32_AUDIO_case3_2" + log_file_name: "log_NAS_UrbanSound8K_DSCNN_STM32_AUDIO_case3_2.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff 
--git a/src/config/nas_config.yaml b/src/config/nas_config.yaml deleted file mode 100644 index ed27fa1..0000000 --- a/src/config/nas_config.yaml +++ /dev/null @@ -1,347 +0,0 @@ -# Hardware / device configuration -device: - name: STM32_NUCLEO_N657X0_Q - hil: true - compile_when_hil_disabled: auto # With hil:false, choose compile-only proxy automatically when score/prune needs compile metrics; use false for pure desktop runs. - runtime_mode: cadenced # Shared runtime control: `back_to_back` or `cadenced`. - # latency_budget_ms: 200.0 # Optional shared cadence-budget override. Defaults to dataset batch_period_ms, then stride / sampling_rate_hz * 1000. - serial_port: "/dev/ttyACM0" - measured_inference_runs: 10 # Number of on-device inference invokes averaged into one measured HIL attempt. - dut_ready_timeout_s: 12.0 # Time to wait for STM32_AI_INIT / DUT READY before sending START. - serial_timeout_s: 12.0 # Time to wait for the measured runtime pass after START. - cpu_clock_mhz_options: [200, 300, 400, 600, 800] # Optional per-trial CPU presets for boards that support runtime clock selection. - harness_serial_port: "/dev/ttyACM1" - harness_fqbn: "arduino:mbed_nano:nano33ble" - harness_auto_flash: "once" - harness_ready_timeout_s: 5.0 - harness_arm_timeout_s: 5.0 - harness_active_timeout_s: 30.0 - harness_done_timeout_s: 5.0 - stm32: - project_root: "sketches/stm32/tinyodom_stm32_lrun" - appli_flash_address: "0x70100000" - wake_margin_us: 5000 # Cadenced wake-up guard band before each release time. - min_sleep_us: 5000 # Cadenced minimum Stop-mode sleep request. - weight_storage_mode: "external_flash" # Set to external_flash to stage weights into NOR flash. 
- weights_memory_pool: "analysis_scripts/stm32_example_project/nucleo_mypool.json" - -# Explicit modular component selection -dataset: - name: oxiod - params: - sampling_rate_hz: 100 # Hz, derived from dataset - window_size: 200 # matches TinyODOM paper - stride: 20 # matches TinyODOM paper - directory: "data/oxiod/" - calibration_windows: 10000 # windows to load for calibration-only jobs; set None for full split - -task: - name: odometry_regression - params: - early_stopping_patience: 40 - -model: - family: odom_tcn - params: - # Odometry HIL/export uses this deterministic perturbed export model to - # approximate trained numeric behavior without requiring a checkpoint. - export_variant: approx_trained - search: {} - -# Training-time switches and limits -training: - nas_epochs: 55 - model_epochs: 990 - nas_trials: 150 - nas_multiobjective_population_size: 50 # population size for multi-objective NAS, note: nas trials should be at least 3x this value - max_total_trials: 300 # safety cap for total attempts to offset pruned/failed runs - quantization: - mode: int8_ptq - search: false - choices: [int8_ptq] - latency_proxy_max_flops: 30000000.0 - train: true - energy_aware: true - # Input mode for HIL sketches: uniform, oxiod_representative, or oxiod_real - input_mode: "uniform" - -# Active NAS policy configuration. -# This is the default scalar NAS setup used by the repo. -# -# This example is meant to be the readable config equivalent of the old -# hard-coded score in `model.py`. Each term below exists for a specific reason: -# - reward lower RMSE -# - lightly penalize RAM usage relative to device RAM capacity -# - lightly penalize flash usage relative to device flash capacity -# - penalize latency only after it exceeds the configured latency budget -# - penalize energy when it deviates from an energy budget derived from power -# budget and latency budget -# -# The `metrics:` subsection defines reusable derived values that make the score -# terms easier to read. 
The `params.terms:` subsection then says how each value -# contributes to the final scalar score. -nas: - score: - type: scoring-function - metrics: - # Flash can live in internal flash and, on some targets, external flash. - # This derived metric lets the score treat total model storage as one value. - total_flash_bytes: - type: add - metrics: - - flash_bytes - - external_flash_bytes - # Reconstruct the old energy target from: - # target power (mW) * latency budget (ms) / 1000 = energy budget (mJ) - # Adjust `power_mw.value` to reflect the per-inference power budget you want. - energy_budget_mj: - type: energy-budget-from-power - power_mw: - type: literal - value: 100.0 - duration_ms: - type: metric - metric: latency_budget_ms - params: - terms: - # Core accuracy term. - # Negative weight means lower RMSE produces a higher scalar score. - - type: weighted - metric: rmse_total - weight: -1.0 - - # Resource term for RAM usage. - # This is normalized by the target device RAM capacity so the penalty - # scales sensibly across boards with different memory sizes. - - type: normalized-weighted - metric: ram_bytes - weight: 0.01 - reference: - type: metric - metric: max_ram_bytes - - # Resource term for flash usage. - # Uses `total_flash_bytes` so internal + external flash are considered - # together when the target supports both. - - type: normalized-weighted - metric: total_flash_bytes - weight: 0.01 - reference: - type: metric - metric: max_flash_bytes - - # Latency policy term. - # Below the latency budget this contributes nothing. - # Above the latency budget it subtracts a penalty proportional to the - # amount of overrun. - - type: boundary - metric: latency_ms - weight: 1.0 - reference: - type: metric - metric: latency_budget_ms - - # Energy policy term. - # This compares measured energy per inference against the derived - # `energy_budget_mj` target. If energy is below target or above target, - # the penalty grows with the distance from that target. 
- - type: target - metric: energy_mj_per_inference - weight: 0.15 - reference: - type: metric - metric: energy_budget_mj - prune: - rules: - # Pre-training latency gate. - # If the measured latency already exceeds the deployment budget, prune - # the trial before training so NAS does not waste time on it. - - rule: latency_budget - metric: latency_ms - condition: gt - reference: - type: metric - metric: latency_budget_ms - reason: "Latency exceeds deployment budget" - # Optional energy gate that reuses the derived budget metric above. - # - rule: energy_budget - # metric: energy_mj_per_inference - # condition: gt - # reference: - # type: metric - # metric: energy_budget_mj - # reason: "Energy exceeds deployment budget" - -# Alternative multi-objective NAS example. -# Uncomment this block if you want a Pareto front instead of one scalar score. -# -# Multi-objective mode does not collapse everything into one scalar. Instead, -# Optuna keeps a frontier of models that trade off accuracy against runtime. -# This is the better fit when you want to inspect the quality/latency trade -# space rather than committing to one fixed weighting policy up front. -# Note: Pruning is not supported for multi objective studies in Optuna, so the -# `prune.rules` list is left empty here. -# -# When used with `device.runtime_mode: cadenced`, cadenced overload remains -# telemetry in this mode. Trials with `cadenced_deadline_miss_count > 0` or -# `cadenced_active_inference_latency_ms > latency_budget_ms` can still appear -# on the Pareto frontier and should be post-filtered if schedulability is a hard -# requirement. -# -# Example cadenced Pareto setup: -# - objective 1 minimizes total RMSE -# - objective 2 minimizes cadenced energy per window -# -# nas: -# score: -# type: multi-objective -# params: -# objectives: -# # Objective 1: minimize overall prediction error. -# # `rmse_total` is the built-in aggregate `rmse_vel_x + rmse_vel_y`. 
-# - metric: rmse_total -# direction: minimize -# # Objective 2: minimize cadenced energy per scheduled window. -# # This gives Optuna a Pareto tradeoff between model quality and energy. -# - metric: cadenced_energy_mj_per_window -# direction: minimize -# prune: -# rules: [] - -# Alternative scalar cadenced-feasibility example. -# Uncomment and adapt this shape if you want cadenced schedulability to be a -# hard gate before training. This is the config-level version of "deadline -# misses are infeasible". -# -# nas: -# score: -# type: scoring-function -# metrics: -# total_flash_bytes: -# type: add -# metrics: -# - flash_bytes -# - external_flash_bytes -# energy_budget_mj: -# type: energy-budget-from-power -# power_mw: -# type: literal -# value: 100.0 -# duration_ms: -# type: metric -# metric: latency_budget_ms -# params: -# terms: -# - type: weighted -# metric: rmse_total -# weight: -1.0 -# - type: normalized-weighted -# metric: ram_bytes -# weight: 0.01 -# reference: -# type: metric -# metric: max_ram_bytes -# - type: normalized-weighted -# metric: total_flash_bytes -# weight: 0.01 -# reference: -# type: metric -# metric: max_flash_bytes -# - type: target -# metric: cadenced_energy_mj_per_window -# weight: 0.15 -# reference: -# type: metric -# metric: energy_budget_mj -# prune: -# rules: -# - rule: latency_budget -# metric: latency_ms -# condition: gt -# reference: -# type: metric -# metric: latency_budget_ms -# reason: "Latency exceeds deployment budget" -# - rule: cadenced_deadline_budget -# metric: cadenced_deadline_miss_count -# condition: gt -# reference: -# type: literal -# value: 0 -# reason: "Cadenced deadline missed" -# - rule: cadenced_active_latency_budget -# metric: cadenced_active_inference_latency_ms -# condition: gt -# reference: -# type: metric -# metric: latency_budget_ms -# reason: "Cadenced active inference exceeds slot budget" -# # Optional backend-local cadenced error gate. 
-# # - rule: cadenced_phase_ok -# # metric: cadenced_error_code -# # condition: gt -# # reference: -# # type: literal -# # value: 0 -# # reason: "Cadenced phase failed" - -# Alternative non-HIL scalar example. -# Uncomment and adapt this shape if you want `device.hil: false` and do not -# want the score to depend on measured latency or energy. In proxy mode those -# metrics are unavailable, so keep the score/prune policy limited to metrics -# that still exist before training or after proxy-only compilation. -# -# device: -# hil: false -# -# nas: -# score: -# type: scoring-function -# metrics: -# total_flash_bytes: -# type: add -# metrics: -# - flash_bytes -# - external_flash_bytes -# params: -# terms: -# - type: weighted -# metric: rmse_total -# weight: -1.0 -# - type: normalized-weighted -# metric: ram_bytes -# weight: 0.01 -# reference: -# type: metric -# metric: max_ram_bytes -# - type: normalized-weighted -# metric: total_flash_bytes -# weight: 0.01 -# reference: -# type: metric -# metric: max_flash_bytes -# - type: normalized-weighted -# metric: flops -# weight: 0.01 -# reference: -# type: literal -# value: 30000000.0 -# prune: -# rules: [] - -# Output directories and filenames -outputs: - models_dir: "models" - candidate_dir: "odom_tcn" - artifact_stem: "TinyOdomEx_OxIOD" - log_file_name: "log_NAS_Oxiod_STM32_NUCLEO_N657X0_Q_MO.csv" - -network: - host: "127.0.0.1" - port: 6001 - recv_timeout_sec: 720 - send_timeout_sec: 720 - -# Runtime logging controls -logging: - level: "INFO" diff --git a/src/config/nas_config_audio_portenta.yaml b/src/config/nas_config_audio_portenta.yaml index 17f46f9..a15a579 100644 --- a/src/config/nas_config_audio_portenta.yaml +++ b/src/config/nas_config_audio_portenta.yaml @@ -1,31 +1,26 @@ -# Audio DS-CNN config for Arduino-backed Portenta H7 CM7 smoke/HIL runs. -# -# Phase 8 measures classifier inference over precomputed log-mel tensors only. -# Firmware-side microphone capture and audio frontend timing are out of scope. 
+# Portenta H7 CM7 UrbanSound8K DS-CNN measured-energy NAS example. device: name: PORTENTA_H7 hil: true + compile_when_hil_disabled: auto runtime_mode: back_to_back + latency_budget_ms: 2000.0 serial_port: "/dev/ttyACM0" measured_inference_runs: 10 - dut_ready_timeout_s: 14.5 + dut_ready_timeout_s: 12.0 serial_timeout_s: 12.0 - portenta: - target_core: cm7 - split: "75_25" - security: none harness_serial_port: "/dev/ttyACM1" harness_fqbn: "arduino:mbed_nano:nano33ble" harness_auto_flash: "once" - harness_arm_pin: 3 - harness_trigger_pin: 2 - dut_arm_hold_ms: 600 - harness_stable_low_ms: 500 - harness_ready_timeout_s: 3.6 - harness_arm_timeout_s: 0.0 - harness_active_timeout_s: 12.0 - harness_done_timeout_s: 3.6 + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + portenta: + target_core: "cm7" + split: "75_25" + security: "none" dataset: name: urbansound8k_mel @@ -47,12 +42,12 @@ model: family: audio_dscnn params: export_variant: untrained - search: {} # Empty search uses AudioDSCNNFamily.AUDIO_DSCNN_SEARCH_CHOICES. 
+ search: {} training: nas_epochs: 55 model_epochs: 990 - nas_trials: 150 + nas_trials: 200 nas_multiobjective_population_size: 50 max_total_trials: 300 quantization: @@ -61,49 +56,72 @@ training: choices: [int8_ptq] latency_proxy_max_flops: 30000000.0 train: true - energy_aware: false + energy_aware: true input_mode: "uniform" nas: score: type: scoring-function metrics: - total_flash_bytes: - type: add - metrics: - - flash_bytes - - external_flash_bytes + system_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 800.0 + duration_ms: + type: metric + metric: latency_budget_ms + inference_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 200.0 + duration_ms: + type: metric + metric: latency_budget_ms params: terms: - type: weighted - metric: accuracy + metric: macro_f1 weight: 1.0 - type: normalized-weighted - metric: ram_bytes - weight: -0.10 - reference: - type: metric - metric: max_ram_bytes - - type: normalized-weighted - metric: total_flash_bytes + metric: energy_mj_per_inference weight: -0.10 reference: type: metric - metric: max_flash_bytes - - type: boundary - metric: latency_ms - weight: 0.01 - reference: - type: metric - metric: latency_budget_ms + metric: inference_energy_budget_mj prune: rules: [] + feasibility: + train_if_infeasible: false + rules: + - rule: latency_budget + metric: latency_ms + condition: gt + reference: + type: metric + metric: latency_budget_ms + reason: "Latency exceeds the 2 s audio decision cadence" + - rule: ram_capacity + metric: ram_bytes + condition: gt + reference: + type: metric + metric: max_ram_bytes + reason: "RAM usage exceeds target capacity" + - rule: flash_capacity + metric: flash_bytes + condition: gt + reference: + type: metric + metric: max_flash_bytes + reason: "Flash usage exceeds target capacity" outputs: models_dir: "models" candidate_dir: "audio_dscnn" - artifact_stem: "TinyOdomEx_UrbanSound8K" - log_file_name: 
"log_NAS_UrbanSound8K_PORTENTA_H7.csv" + artifact_stem: "TinyOdomEx_UrbanSound8K_PORTENTA_M7" + log_file_name: "log_NAS_TinyOdomEx_UrbanSound8K_PORTENTA_M7.csv" network: host: "127.0.0.1" diff --git a/src/config/nas_config_audio_stm32.yaml b/src/config/nas_config_audio_stm32.yaml index 9744ffc..312830c 100644 --- a/src/config/nas_config_audio_stm32.yaml +++ b/src/config/nas_config_audio_stm32.yaml @@ -1,17 +1,16 @@ -# Audio DS-CNN desktop/bootstrap config targeting the STM32 N657 shape. -# -# This config targets the joint GPU plus STM32 HIL flow. Hardware-free audio -# short analysis runs should use analysis_scripts/audio_desktop_smoke instead. +# STM32 N657 UrbanSound8K DS-CNN measured-energy NAS example. device: name: STM32_NUCLEO_N657X0_Q hil: true + compile_when_hil_disabled: auto runtime_mode: cadenced + latency_budget_ms: 2000.0 serial_port: "/dev/ttyACM0" measured_inference_runs: 10 dut_ready_timeout_s: 12.0 serial_timeout_s: 12.0 - cpu_clock_mhz_options: [200, 300, 400, 600, 800] + cpu_clock_mhz_options: null harness_serial_port: "/dev/ttyACM1" harness_fqbn: "arduino:mbed_nano:nano33ble" harness_auto_flash: "once" @@ -47,18 +46,18 @@ model: family: audio_dscnn params: export_variant: untrained - search: {} # Empty search uses AudioDSCNNFamily.AUDIO_DSCNN_SEARCH_CHOICES. 
+ search: {} training: nas_epochs: 55 model_epochs: 990 - nas_trials: 150 + nas_trials: 200 nas_multiobjective_population_size: 50 max_total_trials: 300 quantization: mode: int8_ptq - search: true - choices: [float, int8_ptq] + search: false + choices: [int8_ptq] latency_proxy_max_flops: 30000000.0 train: true energy_aware: true @@ -68,35 +67,37 @@ nas: score: type: scoring-function metrics: - total_flash_bytes: - type: add - metrics: - - flash_bytes - - external_flash_bytes + system_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 800.0 + duration_ms: + type: metric + metric: latency_budget_ms + inference_energy_budget_mj: + type: energy-budget-from-power + power_mw: + type: literal + value: 200.0 + duration_ms: + type: metric + metric: latency_budget_ms params: terms: - type: weighted - metric: accuracy + metric: macro_f1 weight: 1.0 - type: normalized-weighted - metric: ram_bytes + metric: energy_mj_per_inference weight: -0.10 reference: type: metric - metric: max_ram_bytes - - type: normalized-weighted - metric: total_flash_bytes - weight: -0.10 - reference: - type: metric - metric: max_flash_bytes - - type: boundary - metric: latency_ms - weight: 0.01 - reference: - type: metric - metric: latency_budget_ms + metric: inference_energy_budget_mj prune: + rules: [] + feasibility: + train_if_infeasible: false rules: - rule: latency_budget metric: latency_ms @@ -104,13 +105,34 @@ nas: reference: type: metric metric: latency_budget_ms - reason: "Latency exceeds deployment budget" + reason: "Latency exceeds the 2 s audio decision cadence" + - rule: ram_capacity + metric: ram_bytes + condition: gt + reference: + type: metric + metric: max_ram_bytes + reason: "RAM usage exceeds target capacity" + - rule: flash_capacity + metric: flash_bytes + condition: gt + reference: + type: metric + metric: max_flash_bytes + reason: "Internal flash usage exceeds target capacity" + - rule: external_flash_capacity + metric: external_flash_bytes + 
condition: gt + reference: + type: literal + value: 67108864 + reason: "External flash weight storage exceeds target capacity" outputs: models_dir: "models" candidate_dir: "audio_dscnn" artifact_stem: "TinyOdomEx_UrbanSound8K" - log_file_name: "log_NAS_UrbanSound8K_STM32_NUCLEO_N657X0_Q.csv" + log_file_name: "log_NAS_TinyOdomEx_UrbanSound8K_STM32.csv" network: host: "127.0.0.1" diff --git a/src/config/nas_config_ble.yaml b/src/config/nas_config_ble.yaml index b810827..48fd273 100644 --- a/src/config/nas_config_ble.yaml +++ b/src/config/nas_config_ble.yaml @@ -1,30 +1,30 @@ -# Hardware / device configuration +# Arduino Nano 33 BLE Sense OxIOD measured-energy NAS example. + device: name: ARDUINO_NANO_33_BLE_SENSE hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back serial_port: "/dev/ttyACM0" - dut_ready_timeout_s: 14.5 # Time to wait for DUT READY before sending START. - harness_serial_port: "/dev/ttyACM1" # Serial port for the harness board (INA228 reader). - harness_fqbn: "arduino:mbed_nano:nano33ble" # Arduino FQBN used to compile/upload the harness sketch. - harness_auto_flash: "once" # Harness flash policy: once, always, or never. - harness_arm_pin: 3 # Active-low arm GPIO shared between DUT and harness. - harness_trigger_pin: 2 # Trigger GPIO toggled HIGH/LOW around inference. - dut_arm_hold_ms: 600 # DUT hold time after arm LOW before trigger HIGH. - harness_stable_low_ms: 500 # Required D3 LOW + D2 LOW stability before harness arms. - harness_ready_timeout_s: 3.6 # Time to wait for HARNESS READY after PING. - harness_arm_timeout_s: 0.0 # Harness arm timeout in seconds (compiled into harness firmware; 0 disables). - harness_active_timeout_s: 12.0 # Max duration of the measurement window. - harness_done_timeout_s: 3.6 # Time to wait for DONE after the window completes. 
+ measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 -# Explicit modular component selection dataset: name: oxiod params: - sampling_rate_hz: 100 # Hz, derived from dataset - window_size: 200 # matches TinyODOM paper - stride: 20 # matches TinyODOM paper + sampling_rate_hz: 100 + window_size: 200 + stride: 20 directory: "data/oxiod/" - calibration_windows: 10000 # windows to load for calibration-only jobs; set None for full split + calibration_windows: 10000 task: name: odometry_regression @@ -34,27 +34,22 @@ task: model: family: odom_tcn params: - # Odometry HIL/export uses this deterministic perturbed export model to - # approximate trained numeric behavior without requiring a checkpoint. export_variant: approx_trained search: {} -# Training-time switches and limits training: nas_epochs: 55 model_epochs: 990 nas_trials: 150 - nas_multiobjective_population_size: 50 # population size for multi-objective NAS, note: nas trials should be at least 3x this value - max_total_trials: 300 # safety cap for total attempts to offset pruned/failed runs + nas_multiobjective_population_size: 50 + max_total_trials: 300 quantization: mode: int8_ptq - search: false - choices: [int8_ptq] + search: true + choices: [float, int8_ptq] latency_proxy_max_flops: 30000000.0 train: true - # Enable energy-aware scoring during NAS energy_aware: true - # Input mode for HIL sketches: uniform, oxiod_representative, or oxiod_real input_mode: "uniform" nas: @@ -64,17 +59,16 @@ nas: objectives: - metric: rmse_total direction: minimize - - metric: latency_ms + - metric: energy_mj_per_inference direction: minimize prune: rules: [] -# Output directories and filenames outputs: models_dir: "models" candidate_dir: "odom_tcn" - artifact_stem: 
"TinyOdomEx_OxIOD" - log_file_name: "log_NAS_Oxiod_ARDUINO_NANO_33_BLE_SENSE_MO.csv" + artifact_stem: "TinyOdomEx_OxIOD_BLE33" + log_file_name: "log_NAS_TinyOdomEx_OxIOD_BLE33.csv" network: host: "127.0.0.1" @@ -82,6 +76,5 @@ network: recv_timeout_sec: 720 send_timeout_sec: 720 -# Runtime logging controls logging: level: "INFO" diff --git a/src/config/nas_config_memory_proxy.yaml b/src/config/nas_config_memory_proxy.yaml new file mode 100644 index 0000000..33b2730 --- /dev/null +++ b/src/config/nas_config_memory_proxy.yaml @@ -0,0 +1,78 @@ +# Pure desktop OxIOD NAS example: minimize validation RMSE and static memory traffic. + +device: + name: STM32_NUCLEO_N657X0_Q + hil: false + compile_when_hil_disabled: false + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + stm32: + project_root: "sketches/stm32/tinyodom_stm32_lrun" + appli_flash_address: "0x70100000" + weight_storage_mode: "external_flash" + weights_memory_pool: "analysis_scripts/stm32_example_project/nucleo_mypool.json" + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 150 + nas_multiobjective_population_size: 50 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: false + choices: [int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: false + input_mode: "uniform" + +nas: + score: + type: multi-objective + params: + objectives: + - metric: rmse_total + direction: minimize + - metric: memory_traffic_bytes + direction: minimize + prune: + rules: [] + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "OxIOD_MEMORY_PROXY_case1_1" + 
log_file_name: "log_NAS_OxIOD_MEMORY_PROXY_case1_1.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/config/nas_config_portenta.yaml b/src/config/nas_config_portenta.yaml index 9ef73f0..5787822 100644 --- a/src/config/nas_config_portenta.yaml +++ b/src/config/nas_config_portenta.yaml @@ -1,35 +1,34 @@ -# Hardware / device configuration +# Portenta H7 CM7 OxIOD measured-energy NAS example. + device: name: PORTENTA_H7 hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back serial_port: "/dev/ttyACM0" - measured_inference_runs: 10 # Number of on-device inference invokes averaged into one measured HIL attempt. + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 portenta: - target_core: "cm7" # Required for PORTENTA_H7. Choose: cm7 or cm4. - split: "75_25" # Optional. Defaults: cm7->75_25, cm4->50_50. - security: "none" # Optional. Default is none. - dut_ready_timeout_s: 14.5 # Time to wait for DUT READY before sending START. - harness_serial_port: "/dev/ttyACM1" # Serial port for the harness board (INA228 reader). - harness_fqbn: "arduino:mbed_nano:nano33ble" # Arduino FQBN used to compile/upload the harness sketch. - harness_auto_flash: "once" # Harness flash policy: once, always, or never. - harness_arm_pin: 3 # Active-low arm GPIO shared between DUT and harness. - harness_trigger_pin: 2 # Trigger GPIO toggled HIGH/LOW around inference. - dut_arm_hold_ms: 600 # DUT hold time after arm LOW before trigger HIGH. - harness_stable_low_ms: 500 # Required D3 LOW + D2 LOW stability before harness arms. - harness_ready_timeout_s: 3.6 # Time to wait for HARNESS READY after PING. 
- harness_arm_timeout_s: 0.0 # Harness arm timeout in seconds (compiled into harness firmware; 0 disables). - harness_active_timeout_s: 12.0 # Max duration of the measurement window. - harness_done_timeout_s: 3.6 # Time to wait for DONE after the window completes. + target_core: "cm7" + split: "75_25" + security: "none" -# Explicit modular component selection dataset: name: oxiod params: - sampling_rate_hz: 100 # Hz, derived from dataset - window_size: 200 # matches TinyODOM paper - stride: 20 # matches TinyODOM paper + sampling_rate_hz: 100 + window_size: 200 + stride: 20 directory: "data/oxiod/" - calibration_windows: 10000 # windows to load for calibration-only jobs; set None for full split + calibration_windows: 10000 task: name: odometry_regression @@ -39,27 +38,22 @@ task: model: family: odom_tcn params: - # Odometry HIL/export uses this deterministic perturbed export model to - # approximate trained numeric behavior without requiring a checkpoint. export_variant: approx_trained search: {} -# Training-time switches and limits training: nas_epochs: 55 model_epochs: 990 nas_trials: 150 - nas_multiobjective_population_size: 50 # population size for multi-objective NAS, note: nas trials should be at least 3x this value - max_total_trials: 300 # safety cap for total attempts to offset pruned/failed runs + nas_multiobjective_population_size: 50 + max_total_trials: 300 quantization: mode: int8_ptq - search: false - choices: [int8_ptq] + search: true + choices: [float, int8_ptq] latency_proxy_max_flops: 30000000.0 train: true - # Enable energy-aware scoring during NAS energy_aware: true - # Input mode for HIL sketches: uniform, oxiod_representative, or oxiod_real input_mode: "uniform" nas: @@ -69,17 +63,16 @@ nas: objectives: - metric: rmse_total direction: minimize - - metric: latency_ms + - metric: energy_mj_per_inference direction: minimize prune: rules: [] -# Output directories and filenames outputs: models_dir: "models" candidate_dir: "odom_tcn" - artifact_stem: 
"TinyOdomEx_OxIOD" - log_file_name: "log_NAS_Oxiod_PORTENTA_H7_MO.csv" + artifact_stem: "TinyOdomEx_OxIOD_PORTENTA_M7" + log_file_name: "log_NAS_TinyOdomEx_OxIOD_PORTENTA_M7.csv" network: host: "127.0.0.1" @@ -87,6 +80,5 @@ network: recv_timeout_sec: 720 send_timeout_sec: 720 -# Runtime logging controls logging: level: "INFO" diff --git a/src/config/nas_config_stm32.yaml b/src/config/nas_config_stm32.yaml new file mode 100644 index 0000000..8b25955 --- /dev/null +++ b/src/config/nas_config_stm32.yaml @@ -0,0 +1,308 @@ +# STM32 N657 OxIOD measured-energy NAS example. + +device: + name: STM32_NUCLEO_N657X0_Q + hil: true + compile_when_hil_disabled: auto + runtime_mode: back_to_back + serial_port: "/dev/ttyACM0" + measured_inference_runs: 10 + dut_ready_timeout_s: 12.0 + serial_timeout_s: 12.0 + cpu_clock_mhz_options: [200, 300, 400, 600, 800] + harness_serial_port: "/dev/ttyACM1" + harness_fqbn: "arduino:mbed_nano:nano33ble" + harness_auto_flash: "once" + harness_ready_timeout_s: 5.0 + harness_arm_timeout_s: 5.0 + harness_active_timeout_s: 30.0 + harness_done_timeout_s: 5.0 + stm32: + project_root: "sketches/stm32/tinyodom_stm32_lrun" + appli_flash_address: "0x70100000" + wake_margin_us: 5000 + min_sleep_us: 5000 + weight_storage_mode: "external_flash" + weights_memory_pool: "analysis_scripts/stm32_example_project/nucleo_mypool.json" + +dataset: + name: oxiod + params: + sampling_rate_hz: 100 + window_size: 200 + stride: 20 + directory: "data/oxiod/" + calibration_windows: 10000 + +task: + name: odometry_regression + params: + early_stopping_patience: 40 + +model: + family: odom_tcn + params: + export_variant: approx_trained + search: {} + +training: + nas_epochs: 55 + model_epochs: 990 + nas_trials: 250 + nas_multiobjective_population_size: 50 + max_total_trials: 300 + quantization: + mode: int8_ptq + search: true + choices: [float, int8_ptq] + latency_proxy_max_flops: 30000000.0 + train: true + energy_aware: true + input_mode: "uniform" + +nas: + score: + type: 
multi-objective + params: + objectives: + - metric: rmse_total + direction: minimize + - metric: energy_mj_per_inference + direction: minimize + prune: + rules: [] + +# Alternative scalar NAS example. +# Uncomment and adapt this block if you want one weighted score instead of a +# Pareto front. This shape rewards lower RMSE, lightly penalizes RAM and flash, +# gates latency against the deployment budget, and compares energy against a +# budget derived from power and latency. +# +# nas: +# score: +# type: scoring-function +# metrics: +# total_flash_bytes: +# type: add +# metrics: +# - flash_bytes +# - external_flash_bytes +# energy_budget_mj: +# type: energy-budget-from-power +# power_mw: +# type: literal +# value: 100.0 +# duration_ms: +# type: metric +# metric: latency_budget_ms +# params: +# terms: +# - type: weighted +# metric: rmse_total +# weight: -1.0 +# - type: normalized-weighted +# metric: ram_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_ram_bytes +# - type: normalized-weighted +# metric: total_flash_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_flash_bytes +# - type: boundary +# metric: latency_ms +# weight: 1.0 +# reference: +# type: metric +# metric: latency_budget_ms +# - type: target +# metric: energy_mj_per_inference +# weight: 0.15 +# reference: +# type: metric +# metric: energy_budget_mj +# feasibility: +# train_if_infeasible: false +# rules: +# - rule: latency_budget +# metric: latency_ms +# condition: gt +# reference: +# type: metric +# metric: latency_budget_ms +# reason: "Latency exceeds deployment budget" +# # - rule: energy_budget +# # metric: energy_mj_per_inference +# # condition: gt +# # reference: +# # type: metric +# # metric: energy_budget_mj +# # reason: "Energy exceeds deployment budget" + +# Alternative cadenced multi-objective NAS example. 
+# Uncomment this block if you want a Pareto front for cadenced execution: +# one objective minimizes prediction error and the other minimizes cadenced +# energy per scheduled window. +# +# device: +# runtime_mode: cadenced +# +# nas: +# score: +# type: multi-objective +# params: +# objectives: +# - metric: rmse_total +# direction: minimize +# - metric: cadenced_energy_mj_per_window +# direction: minimize +# feasibility: +# train_if_infeasible: false +# rules: +# - rule: cadenced_deadline_budget +# metric: cadenced_deadline_miss_count +# condition: gt +# reference: +# type: literal +# value: 0 +# reason: "Cadenced deadline missed" + +# Alternative cadenced-feasibility scalar example. +# Uncomment and adapt this shape if you want cadenced schedulability as a +# constrained deployability policy before training. The same rule shape works +# for scalar and multi-objective scores. +# +# nas: +# score: +# type: scoring-function +# metrics: +# total_flash_bytes: +# type: add +# metrics: +# - flash_bytes +# - external_flash_bytes +# energy_budget_mj: +# type: energy-budget-from-power +# power_mw: +# type: literal +# value: 100.0 +# duration_ms: +# type: metric +# metric: latency_budget_ms +# params: +# terms: +# - type: weighted +# metric: rmse_total +# weight: -1.0 +# - type: normalized-weighted +# metric: ram_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_ram_bytes +# - type: normalized-weighted +# metric: total_flash_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_flash_bytes +# - type: target +# metric: cadenced_energy_mj_per_window +# weight: 0.15 +# reference: +# type: metric +# metric: energy_budget_mj +# feasibility: +# train_if_infeasible: false +# rules: +# - rule: latency_budget +# metric: latency_ms +# condition: gt +# reference: +# type: metric +# metric: latency_budget_ms +# reason: "Latency exceeds deployment budget" +# - rule: cadenced_deadline_budget +# metric: cadenced_deadline_miss_count +# condition: gt +# 
reference: +# type: literal +# value: 0 +# reason: "Cadenced deadline missed" +# - rule: cadenced_active_latency_budget +# metric: cadenced_active_inference_latency_ms +# condition: gt +# reference: +# type: metric +# metric: latency_budget_ms +# reason: "Cadenced active inference exceeds slot budget" +# # - rule: cadenced_phase_ok +# # metric: cadenced_error_code +# # condition: gt +# # reference: +# # type: literal +# # value: 0 +# # reason: "Cadenced phase failed" + +# Alternative non-HIL scalar example. +# Uncomment and adapt this shape if you want `device.hil: false` and do not +# want the score to depend on measured latency or energy. In proxy mode those +# metrics are unavailable, so keep the score/prune/feasibility policy limited +# to metrics that still exist before training or after proxy-only compilation. +# +# device: +# hil: false +# compile_when_hil_disabled: false +# +# nas: +# score: +# type: scoring-function +# metrics: +# total_flash_bytes: +# type: add +# metrics: +# - flash_bytes +# - external_flash_bytes +# params: +# terms: +# - type: weighted +# metric: rmse_total +# weight: -1.0 +# - type: normalized-weighted +# metric: ram_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_ram_bytes +# - type: normalized-weighted +# metric: total_flash_bytes +# weight: 0.01 +# reference: +# type: metric +# metric: max_flash_bytes +# - type: normalized-weighted +# metric: flops +# weight: 0.01 +# reference: +# type: literal +# value: 30000000.0 +# prune: +# rules: [] + +outputs: + models_dir: "models" + candidate_dir: "odom_tcn" + artifact_stem: "TinyOdomEx_OxIOD_STM32" + log_file_name: "log_NAS_TinyOdomEx_OxIOD_STM32.csv" + +network: + host: "127.0.0.1" + port: 6001 + recv_timeout_sec: 720 + send_timeout_sec: 720 + +logging: + level: "INFO" diff --git a/src/nas_model_client.py b/src/nas_model_client.py index af3b60d..722fe9b 100644 --- a/src/nas_model_client.py +++ b/src/nas_model_client.py @@ -52,6 +52,7 @@ configured_quantization_mode, 
build_trial_outcome, apply_cadenced_metric_defaults, + evaluate_feasibility_rules, evaluate_prune_rules, evaluate_score_config, get_score_config_directions, @@ -65,6 +66,7 @@ score_config_uses_training_metrics, set_error_code, ) +from tinyodom.model_metrics import StaticMemoryEstimate from tinyodom.pipeline_types import DataSplit, DatasetBundle, ModelBuildContext from tinyodom.registry import dataset_registry, model_family_registry from tinyodom.runtime_bootstrap import bootstrap_pipeline @@ -91,6 +93,8 @@ "harness_latency_ms", } ) +FEASIBILITY_POLICY_SIGNATURE_ATTR = "tinyodom_feasibility_policy_signature" +FEASIBILITY_NOT_EVALUATED_CONSTRAINT = 1e12 @dataclass(frozen=True) @@ -149,7 +153,7 @@ class NASModelClient: ---------- config_path : Path | str, optional Path to the NAS configuration YAML. Defaults to - ``src/config/nas_config.yaml`` via ``DEFAULT_CONFIG_PATH``. The + ``src/config/nas_config_stm32.yaml`` via ``DEFAULT_CONFIG_PATH``. The configuration controls data paths, device settings, NAS options (single vs multi-objective), training schedules, output directories, and network @@ -559,6 +563,200 @@ def _study_metric_names(self) -> list[str]: return [str(obj.metric) for obj in self.config.nas.score.params.objectives] return ["score"] + def _feasibility_config(self) -> Dict: + """Return the normalized NAS feasibility config with defaults. + + Returns + ------- + addict.Dict + Feasibility policy exposing ``train_if_infeasible`` and ``rules``. + """ + + raw_config = getattr(self.config.nas, "feasibility", None) + feasibility_config = Dict(raw_config or {}) + feasibility_config.train_if_infeasible = bool( + getattr(feasibility_config, "train_if_infeasible", False) + ) + feasibility_config.rules = list(getattr(feasibility_config, "rules", [])) + return feasibility_config + + def _feasibility_enabled(self) -> bool: + """Return whether the active config has feasibility rules. 
+ + Returns + ------- + bool + ``True`` when ``nas.feasibility.rules`` contains at least one rule. + """ + + return bool(self._feasibility_config().rules) + + @staticmethod + def _normalize_feasibility_reference(reference: Any) -> dict[str, Any]: + """Return a signature-stable feasibility reference payload. + + Parameters + ---------- + reference : Any + Normalized rule reference from config. + + Returns + ------- + dict[str, Any] + JSON-friendly reference shape used in the study policy signature. + """ + + ref_type = str(getattr(reference, "type", "")).strip().lower() + if ref_type == "literal": + return {"type": "literal", "value": float(reference.value)} + return {"type": "metric", "metric": str(reference.metric)} + + def _feasibility_policy_signature(self) -> dict[str, Any] | None: + """Build the study-level feasibility policy signature. + + Returns + ------- + dict[str, Any] | None + Ordered normalized policy signature, or ``None`` when feasibility + is disabled. + """ + + feasibility_config = self._feasibility_config() + if not feasibility_config.rules: + return None + return { + "train_if_infeasible": bool(feasibility_config.train_if_infeasible), + "rules": [ + { + "rule": str(rule_cfg.rule), + "metric": str(rule_cfg.metric), + "condition": str(rule_cfg.condition), + "reference": self._normalize_feasibility_reference(rule_cfg.reference), + } + for rule_cfg in feasibility_config.rules + ], + } + + def _feasibility_rule_count(self) -> int: + """Return the number of active feasibility constraints. + + Returns + ------- + int + Length of ``nas.feasibility.rules`` for the active config. + """ + + return len(self._feasibility_config().rules) + + def _constraints_func(self, frozen_trial: optuna.trial.FrozenTrial) -> tuple[float, ...]: + """Return persisted feasibility constraints for Optuna samplers. + + Parameters + ---------- + frozen_trial : optuna.trial.FrozenTrial + Completed trial whose user attributes carry feasibility metadata. 
+ + Returns + ------- + tuple[float, ...] + Signed constraints in policy order. Empty when feasibility is + disabled. + + Notes + ----- + The sampler callback must never re-evaluate the current config against + old trial metrics. It reads only persisted trial attributes; hard + failures and prune-rule exits use a positive persisted status-derived + sentinel so constrained samplers rank them infeasible. + """ + + if not self._feasibility_enabled(): + return () + raw_constraints = frozen_trial.user_attrs.get("feasibility_constraints") + if raw_constraints not in (None, ""): + return tuple(float(value) for value in raw_constraints) + status = str(frozen_trial.user_attrs.get("feasibility_status", "")).strip() + if status == "not_evaluated": + return tuple(FEASIBILITY_NOT_EVALUATED_CONSTRAINT for _ in range(self._feasibility_rule_count())) + return tuple(FEASIBILITY_NOT_EVALUATED_CONSTRAINT for _ in range(self._feasibility_rule_count())) + + def _validate_or_store_feasibility_signature(self, study: optuna.Study) -> None: + """Validate persisted feasibility policy metadata for a study. + + Parameters + ---------- + study : optuna.Study + Study being created or resumed. + + Returns + ------- + None + + Raises + ------ + RuntimeError + If the active config and persisted study policy are incompatible. + """ + + active_signature = self._feasibility_policy_signature() + stored_signature = getattr(study, "user_attrs", {}).get(FEASIBILITY_POLICY_SIGNATURE_ATTR) + has_trials = bool(study.trials) + if active_signature is None: + if stored_signature is not None: + raise RuntimeError( + "Existing study has a feasibility policy signature, but the active config disables nas.feasibility." + ) + return + if stored_signature is None: + if has_trials: + raise RuntimeError( + "Active config enables nas.feasibility, but the existing study has no feasibility policy signature." 
+ ) + study.set_user_attr(FEASIBILITY_POLICY_SIGNATURE_ATTR, active_signature) + elif stored_signature != active_signature: + raise RuntimeError( + "Active nas.feasibility policy does not match the existing study feasibility signature." + ) + + for trial in study.trials: + if trial.state != TrialState.COMPLETE: + continue + status = str(trial.user_attrs.get("feasibility_status", "")).strip() + if status == "not_evaluated": + continue + constraints = trial.user_attrs.get("feasibility_constraints") + if status not in {"feasible", "infeasible"} or constraints in (None, ""): + raise RuntimeError( + f"Completed trial {trial.number} lacks required feasibility attributes for the active policy." + ) + + def _build_sampler(self): + """Build an Optuna sampler matching the active score and constraints. + + Returns + ------- + optuna.samplers.BaseSampler + NSGA-II or TPE sampler with feasibility constraints enabled when + configured. + """ + + constraints_func = self._constraints_func if self._feasibility_enabled() else None + if self._score_is_multiobjective(): + kwargs = { + "population_size": self.config.training.nas_multiobjective_population_size, + "seed": 42, + } + if constraints_func is not None: + kwargs["constraints_func"] = constraints_func + return optuna.samplers.NSGAIISampler(**kwargs) + kwargs = { + "n_startup_trials": 15, + "multivariate": True, + } + if constraints_func is not None: + kwargs["constraints_func"] = constraints_func + return optuna.samplers.TPESampler(**kwargs) + def _hardware_limit_device_options(self) -> dict[str, str] | None: """Build board options required to resolve dynamic hardware limits. @@ -689,6 +887,7 @@ def _build_runtime_metadata( self, flops: int, batch_size: int, + static_memory: StaticMemoryEstimate, ) -> Dict: """Build runtime-owned request metadata for HIL and scoring paths. @@ -698,6 +897,8 @@ def _build_runtime_metadata( FLOP count for the built model. batch_size : int Runner-owned batch size. 
+ static_memory : StaticMemoryEstimate + Offline static tensor-memory proxy for the built model. Returns ------- @@ -714,6 +915,11 @@ def _build_runtime_metadata( "timesteps": timesteps, "input_dim": input_dim, "flops": int(flops), + "weight_bytes": int(static_memory.weight_bytes), + "activation_bytes": int(static_memory.activation_bytes), + "memory_traffic_bytes": int(static_memory.memory_traffic_bytes), + "memory_proxy_dtype_bytes": int(static_memory.dtype_bytes), + "memory_proxy_warning_count": int(static_memory.warning_count), } ) @@ -810,6 +1016,10 @@ def _add_metric(metric_name: str, stack: tuple[str, ...] = ()) -> None: _add_metric(str(rule_cfg.metric)) _add_reference(getattr(rule_cfg, "reference", None)) + for rule_cfg in getattr(self._feasibility_config(), "rules", []): + _add_metric(str(rule_cfg.metric)) + _add_reference(getattr(rule_cfg, "reference", None)) + compile_derived = {metric for metric in visited if metric in COMPILE_DERIVED_METRICS} runtime_only = {metric for metric in visited if self._metric_is_runtime_only(metric)} return NASMetricDependencies( @@ -1020,6 +1230,72 @@ def _apply_non_hil_success_sentinels(metrics: dict[str, Any]) -> None: metrics["latency_ms"] = -1 set_error_code(metrics, 1) + def _mark_feasibility_not_evaluated(self, metrics: dict[str, Any]) -> None: + """Attach metadata for a trial that ended before feasibility checks. + + Parameters + ---------- + metrics : dict[str, Any] + Trial metrics dictionary to mutate in place. 
+ + Returns + ------- + None + """ + + metrics["feasible"] = False + metrics["feasibility_status"] = "not_evaluated" + metrics["feasibility_rule"] = "" + metrics["feasibility_reason"] = "" + metrics["feasibility_metric"] = "" + metrics["feasibility_value"] = "" + metrics["feasibility_reference"] = "" + metrics["feasibility_violation"] = "" + metrics["feasibility_constraints_json"] = "" + + @staticmethod + def _apply_feasibility_evaluation(metrics: dict[str, Any], evaluation: Any) -> None: + """Persist resolved feasibility metadata into the trial metrics dict. + + Parameters + ---------- + metrics : dict[str, Any] + Trial metrics dictionary to mutate in place. + evaluation : FeasibilityEvaluation + Resolved feasibility outcome. + + Returns + ------- + None + """ + + metrics["feasible"] = bool(evaluation.feasible) + metrics["feasibility_status"] = str(evaluation.status) + metrics["feasibility_constraints"] = list(evaluation.constraints) + metrics["feasibility_constraints_json"] = json.dumps(list(evaluation.constraints)) + violation = evaluation.first_violation or {} + metrics["feasibility_rule"] = violation.get("rule", "") + metrics["feasibility_reason"] = violation.get("reason", "") + metrics["feasibility_metric"] = violation.get("metric", "") + metrics["feasibility_value"] = violation.get("value", "") + metrics["feasibility_reference"] = violation.get("reference", "") + metrics["feasibility_violation"] = violation.get("violation", "") + + def _direction_penalty_values(self) -> list[float]: + """Return objective penalties aligned with active Optuna directions. + + Returns + ------- + list[float] + ``1e12`` for minimized objectives and ``-1e12`` for maximized + objectives. + """ + + return [ + -1e12 if direction == "maximize" else 1e12 + for direction in self._study_directions() + ] + @staticmethod def _sync_task_metrics(metrics: dict[str, Any], task_metrics: dict[str, Any]) -> None: """Write task-owned evaluation metrics back into the shared metrics dict. 
@@ -1136,10 +1412,11 @@ def objective(self, trial: optuna.Trial) -> float | tuple: This objective samples model hyperparameters (e.g., filters, kernel size, dilations) via Optuna, builds the corresponding TCN model to estimate FLOPs, queries a hardware-in-the-loop (HIL) server for resource/latency - metrics, and—when the candidate passes resource checks—trains and scores - the model on the OXIOD dataset. Trials are pruned on HIL errors or - resource violations. The returned objective is either a single score or - a multi-objective tuple depending on configuration. + metrics, and—when the candidate passes resource and configured + feasibility gates—trains and scores the model on the OXIOD dataset. + Trials are pruned on HIL errors or resource violations. The returned + objective is either a single score or a multi-objective tuple depending + on configuration. Parameters ---------- @@ -1175,15 +1452,17 @@ def objective(self, trial: optuna.Trial) -> float | tuple: 1. sample/build the model family hyperparameters 2. request HIL metrics for the candidate 3. apply hardware limit and arena checks - 4. evaluate pre-training prune rules + 4. evaluate post-build/pre-fit prune rules 5. either train/evaluate the task or synthesize task-metric sentinels 6. validate objective values and log the trial Single-objective runs prune by raising ``optuna.TrialPruned``. - Multi-objective runs instead return penalty tuples so the study can - keep its full objective shape. Sentinel conventions such as ``-1`` and - ``10000.0`` are used to preserve legacy logging/scoring expectations - when hardware or training metrics are unavailable. + Multi-objective HIL errors, resource failures, and feasibility gates + instead log ``pruned=True`` and return direction-aware penalty tuples + so Optuna records a complete trial with the configured objective shape. 
+ Sentinel conventions such as ``-1`` and ``10000.0`` are used to + preserve legacy logging/scoring expectations when hardware or training + metrics are unavailable. """ artifacts_dir = self._artifacts_dir() log_path = artifacts_dir / self.config.outputs.log_file_name @@ -1217,9 +1496,16 @@ def objective(self, trial: optuna.Trial) -> float | tuple: trial, allow_search=uses_quantized_deployment_path, ) + static_memory = self.model_family.estimate_static_memory( + model, + self.model_build_context, + self.model_config, + quantization_mode=quantization_mode, + ) runtime_metadata = self._build_runtime_metadata( flops, batch_size, + static_memory, ) hyperparams = Dict({**family_hparams, **runtime_metadata}) @@ -1259,6 +1545,15 @@ def objective(self, trial: optuna.Trial) -> float | tuple: metrics.setdefault("energy_aware", bool(self.config.training.energy_aware)) else: metrics = self._synthesize_desktop_success_metrics() + metrics.update( + { + "weight_bytes": int(runtime_metadata.weight_bytes), + "activation_bytes": int(runtime_metadata.activation_bytes), + "memory_traffic_bytes": int(runtime_metadata.memory_traffic_bytes), + "memory_proxy_dtype_bytes": int(runtime_metadata.memory_proxy_dtype_bytes), + "memory_proxy_warning_count": int(runtime_metadata.memory_proxy_warning_count), + } + ) needs_hardware_limits = ( collect_compile_metrics @@ -1321,13 +1616,11 @@ def _fail_with_penalty( metrics.setdefault("latency_budget_ms", -1.0) metrics.setdefault("arena_bytes", -1) apply_cadenced_metric_defaults(metrics, metrics) + self._mark_feasibility_not_evaluated(metrics) directions = self._study_directions() if self._score_is_multiobjective(): objective_names = [str(obj.metric) for obj in self.config.nas.score.params.objectives] - objective_values = [ - -1e12 if direction == "maximize" else 1e12 - for direction in directions - ] + objective_values = self._direction_penalty_values() trial_outcome = TrialOutcome( score=None, objective_names=objective_names, @@ -1418,11 +1711,72 @@ 
def _fail_with_penalty( hyperparams=Dict(hyperparams), score_config=self.config.nas.score, prune_config=self.config.nas.prune, + task_nonnegative_metric_names=task_nonnegative_metric_names, ) if prune_hit is not None: prune_rule, prune_reason = prune_hit return _fail_with_penalty(prune_reason, prune_rule=prune_rule) + feasibility_config = self._feasibility_config() + feasibility_evaluation = None + if feasibility_config.rules: + feasibility_evaluation = evaluate_feasibility_rules( + metrics=metrics, + hyperparams=Dict(hyperparams), + score_config=self.config.nas.score, + feasibility_config=feasibility_config, + task_nonnegative_metric_names=task_nonnegative_metric_names, + ) + self._apply_feasibility_evaluation(metrics, feasibility_evaluation) + if ( + not feasibility_evaluation.feasible + and not bool(feasibility_config.train_if_infeasible) + ): + directions = self._study_directions() + if self._score_is_multiobjective(): + objective_names = [str(obj.metric) for obj in self.config.nas.score.params.objectives] + objective_values = self._direction_penalty_values() + trial_outcome = TrialOutcome( + score=None, + objective_names=objective_names, + objective_values=objective_values, + objective_directions=directions, + task_metrics={}, + hyperparams=dict(hyperparams), + artifact_summary=None, + quantization_mode=quantization_mode, + ) + log_trial( + trial_outcome=trial_outcome, + metrics=metrics, + trial=trial, + log_file_name=str(log_path), + study_name=self.study_name, + ) + return tuple(objective_values) + + trial_outcome = TrialOutcome( + score=penalty_acc, + objective_names=["score"], + objective_values=[penalty_acc], + objective_directions=["maximize"], + task_metrics={}, + hyperparams=dict(hyperparams), + artifact_summary=None, + quantization_mode=quantization_mode, + ) + log_trial( + trial_outcome=trial_outcome, + metrics=metrics, + trial=trial, + log_file_name=str(log_path), + study_name=self.study_name, + ) + return penalty_acc + else: + 
metrics.setdefault("feasible", True) + metrics.setdefault("feasibility_status", "not_evaluated") + try: if not self.config.training.train: # The no-training path still needs task-owned metric names in @@ -1595,10 +1949,7 @@ def smoke_test( "train=False is incompatible with score configs that require training-only metrics." ) if self._score_is_multiobjective(): - sampler = optuna.samplers.NSGAIISampler( - population_size=self.config.training.nas_multiobjective_population_size, - seed=42, - ) + sampler = self._build_sampler() single_trial_study = optuna.create_study( directions=self._study_directions(), storage=storage_uri, @@ -1607,10 +1958,7 @@ def smoke_test( load_if_exists=True, ) else: - sampler = optuna.samplers.TPESampler( - n_startup_trials=15, - multivariate=True, - ) + sampler = self._build_sampler() single_trial_study = optuna.create_study( direction="maximize", storage=storage_uri, @@ -1618,6 +1966,7 @@ def smoke_test( sampler=sampler, load_if_exists=True, ) + self._validate_or_store_feasibility_signature(single_trial_study) single_trial_study.set_metric_names(self._study_metric_names()) try: single_trial_study.optimize(self.objective, n_trials=trials) @@ -1668,8 +2017,7 @@ def run_nas( study_name: str, storage: str = "sqlite:///optuna.db", ) -> optuna.Study: - """ - Run NAS with production settings, honoring configuration flags. + """Run NAS with production settings, honoring configuration flags. Parameters ---------- @@ -1680,11 +2028,12 @@ def run_nas( Notes ----- - The pipeline targets `config.training.nas_trials` completed trials and will - retry pruned/failed attempts until that target is met or - `config.training.max_total_trials` is reached. + The pipeline targets ``config.training.nas_trials`` feasible completed + trials and retries pruned, failed, and infeasible attempts until that + target is met or ``config.training.max_total_trials`` is reached. - Failed and pruned trials still consume the total-attempt budget. 
+ Failed, pruned, and infeasible trials still consume the total-attempt + budget. Returns ------- @@ -1696,9 +2045,7 @@ def run_nas( max_total_trials = self.config.training.max_total_trials if self._score_is_multiobjective(): - sampler = optuna.samplers.NSGAIISampler( - population_size=self.config.training.nas_multiobjective_population_size, - seed=42) + sampler = self._build_sampler() study = optuna.create_study( directions=self._study_directions(), storage=storage, @@ -1708,10 +2055,7 @@ def run_nas( ) else: # Set up the Optuna study with TPE sampler and persistent storage. - sampler = optuna.samplers.TPESampler( - n_startup_trials=15, # slightly more exploration than default before narrowing in - multivariate=True, - ) + sampler = self._build_sampler() study = optuna.create_study( direction="maximize", storage=storage, @@ -1719,58 +2063,99 @@ def run_nas( sampler=sampler, load_if_exists=True, # resume if the study already exists ) + self._validate_or_store_feasibility_signature(study) study.set_metric_names(self._study_metric_names()) # Make sure we never shrink the total budget when resuming an existing study. max_total_trials = max(max_total_trials, len(study.trials)) def _trial_counts(): - """Count completed, pruned, and failed Optuna trials. + """Count complete/pruned/failed trials plus feasibility outcomes. Returns ------- - tuple[int, int, int] - Counts for complete, pruned, and failed trials in that order. + tuple[int, int, int, int, int] + Counts for complete, feasible complete, infeasible complete, + pruned, and failed trials in that order. 
""" completed = sum(1 for t in study.trials if t.state == TrialState.COMPLETE) + if self._feasibility_enabled(): + feasible = sum( + 1 + for t in study.trials + if ( + t.state == TrialState.COMPLETE + and str(getattr(t, "user_attrs", {}).get("feasibility_status", "")).strip() + == "feasible" + ) + ) + infeasible = sum( + 1 + for t in study.trials + if ( + t.state == TrialState.COMPLETE + and str(getattr(t, "user_attrs", {}).get("feasibility_status", "")).strip() + == "infeasible" + ) + ) + else: + feasible = completed + infeasible = 0 pruned = sum(1 for t in study.trials if t.state == TrialState.PRUNED) failed = sum(1 for t in study.trials if t.state == TrialState.FAIL) - return completed, pruned, failed + return completed, feasible, infeasible, pruned, failed round_idx = 0 + stop_reason = "unknown" try: while True: - completed, pruned, failed = _trial_counts() + completed, feasible, infeasible, pruned, failed = _trial_counts() total = len(study.trials) + feasible_fraction = (feasible / completed) if completed else 0.0 print( - f"[NAS] Progress: {completed} completed, {pruned} pruned, " - f"{failed} failed ({total} attempted)." + f"[NAS] Progress: {feasible} feasible, {infeasible} infeasible, " + f"{completed} completed, {pruned} pruned, {failed} failed " + f"({total} attempted, feasible_fraction={feasible_fraction:.3f})." 
) - if completed >= target_completions: - print(f"[NAS] Reached target of {target_completions} completed trials.") + if feasible >= target_completions: + stop_reason = "target_feasible_completions" + print(f"[NAS] Reached target of {target_completions} feasible completed trials.") break - remaining_needed = target_completions - completed + remaining_needed = target_completions - feasible remaining_budget = max_total_trials - total if remaining_budget <= 0: + stop_reason = "max_total_trials" print( - f"[NAS] Stopping with {completed}/{target_completions} completed trials " + f"[NAS] Stopping with {feasible}/{target_completions} feasible completed trials " f"after hitting max_total_trials={max_total_trials}." ) break round_idx += 1 - next_batch = min(remaining_needed, remaining_budget) + if self._score_is_multiobjective(): + population_size = int(self.config.training.nas_multiobjective_population_size) + next_batch = min(max(remaining_needed, population_size), remaining_budget) + else: + next_batch = min(remaining_needed, remaining_budget) print(f"[NAS] Launching round {round_idx} for {next_batch} additional trial(s).") study.optimize(self.objective, n_trials=next_batch) except Exception as exc: - completed, pruned, failed = _trial_counts() + completed, feasible, infeasible, pruned, failed = _trial_counts() print( f"[NAS] Aborting after {len(study.trials)} trials " - f"({completed} completed, {pruned} pruned, {failed} failed) because of an error: {exc}" + f"({feasible} feasible, {infeasible} infeasible, {completed} completed, " + f"{pruned} pruned, {failed} failed) because of an error: {exc}" ) raise complete_trials = [t for t in study.trials if t.state == TrialState.COMPLETE] + completed, feasible, infeasible, pruned, failed = _trial_counts() + feasible_fraction = (feasible / completed) if completed else 0.0 + print( + f"[NAS] Finished by {stop_reason}: {feasible} feasible, {infeasible} infeasible, " + f"{completed} completed, {pruned} pruned, {failed} failed, " + 
f"feasible_fraction={feasible_fraction:.3f}." + ) if not complete_trials: print("[NAS] No completed trials recorded; skipping best-trial reporting.") return study @@ -1826,7 +2211,11 @@ def run_scoring_nas(self, study_name: str, storage_uri: str = "sqlite:///optuna. if self._score_is_multiobjective(): # Multi-objective: keep this as a “scoring + analysis” run. - pareto_trials = study.best_trials + pareto_trials = [ + trial + for trial in study.best_trials + if (not self._feasibility_enabled()) or bool(trial.user_attrs.get("feasible")) + ] pareto_ids = [t.number for t in pareto_trials] pareto_df = trials_df[trials_df["number"].isin(pareto_ids)] pareto_csv = Path(self.config.outputs.models_dir) / f"{study_name}_pareto.csv" @@ -1835,7 +2224,8 @@ def run_scoring_nas(self, study_name: str, storage_uri: str = "sqlite:///optuna. print(f"[run_scoring_nas] Pareto front size: {len(pareto_trials)}") return - print(f"[run_scoring_nas] Best value: {study.best_value}") + best_trial = self._best_trial_for_finalization(study) + print(f"[run_scoring_nas] Best feasible value: {best_trial.value}") # 2) Retrain the best architecture for the long schedule with early stopping. history_path = artifacts_dir / "train_history.json" history = self.train_best_trial( @@ -1963,7 +2353,49 @@ def _best_trial_params(self, study_storage: str, study_name: str) -> dict[str, A """ study = optuna.load_study(study_name=study_name, storage=study_storage) - return _family_trial_params(study.best_trial.params) + selected = self._best_trial_for_finalization(study) + return _family_trial_params(selected.params) + + def _best_trial_for_finalization(self, study: optuna.Study) -> Any: + """Return the best completed trial allowed for final artifacts. + + Parameters + ---------- + study : optuna.Study + Study containing completed NAS trials. + + Returns + ------- + Any + Best scalar trial for final training/evaluation metadata. 
+ + Raises + ------ + RuntimeError + If feasibility is enabled and the study has no feasible completed + trial, or if no completed trial exists at all. + """ + + if self._feasibility_enabled(): + feasible_trials = [ + trial + for trial in study.get_trials(deepcopy=False, states=(TrialState.COMPLETE,)) + if ( + str(trial.user_attrs.get("feasibility_status", "")).strip() == "feasible" + and not bool(trial.user_attrs.get("pruned")) + ) + ] + if not feasible_trials: + raise RuntimeError( + "NAS completed without any feasible completed trials; " + "increase training.max_total_trials, relax nas.feasibility, " + "or inspect the feasibility CSV columns." + ) + return max(feasible_trials, key=lambda trial: float(trial.value)) + try: + return study.best_trial + except ValueError as exc: + raise RuntimeError("NAS completed without any completed trials.") from exc def _train_with_decoded_hparams( self, @@ -2118,7 +2550,8 @@ def _evaluate_checkpoint_with_context( best_quantization_mode = None if study_storage and study_name: study = optuna.load_study(study_name=study_name, storage=study_storage) - raw_best_params = dict(study.best_trial.params) + selected_trial = self._best_trial_for_finalization(study) + raw_best_params = dict(selected_trial.params) best_params = _family_trial_params(raw_best_params) best_quantization_mode = raw_best_params.get("quantization_mode") resolved_quantization_mode = ( @@ -2953,7 +3386,8 @@ def write_summary_bundle( summary_path.parent.mkdir(parents=True, exist_ok=True) study = optuna.load_study(study_name=study_name, storage=study_storage) - raw_best_params = dict(study.best_trial.params) + selected_trial = self._best_trial_for_finalization(study) + raw_best_params = dict(selected_trial.params) best_params = _family_trial_params(raw_best_params) quantization_mode = raw_best_params.get( "quantization_mode", diff --git a/src/tinyodom/interfaces.py b/src/tinyodom/interfaces.py index 91067e3..4cb9db2 100644 --- a/src/tinyodom/interfaces.py +++ 
b/src/tinyodom/interfaces.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +from .model_metrics import StaticMemoryEstimate, count_flops_keras, estimate_static_memory_keras from .pipeline_types import ( DataSplit, DatasetBundle, @@ -657,6 +658,41 @@ def custom_objects(self) -> dict[str, Any]: return {} + def estimate_static_memory( + self, + model: tf.keras.Model, + ctx: ModelBuildContext, + config: Any, + *, + quantization_mode: str, + ) -> StaticMemoryEstimate: + """Estimate static tensor memory traffic for one built model. + + Parameters + ---------- + model : tensorflow.keras.Model + Built model to inspect. + ctx : ModelBuildContext + Normalized build-time context. The default implementation does not + need this value, but model families may use it for custom layers. + config : Any + Model-family configuration subtree. The default implementation does + not need this value. + quantization_mode : str + Deployment quantization mode used to choose scalar byte width. + + Returns + ------- + StaticMemoryEstimate + Static proxy estimate for batch size 1. + """ + + del ctx, config + return estimate_static_memory_keras( + model, + quantization_mode=quantization_mode, + ) + def count_flops( self, model: tf.keras.Model, @@ -670,25 +706,29 @@ def count_flops( model : tf.keras.Model Built model to profile. ctx : ModelBuildContext - Normalized build-time context. + Normalized build-time context containing the logical input shape. config : Any - Model-family configuration subtree. + Model-family configuration subtree. The default implementation does + not need this value. Returns ------- int - Model FLOP count when implemented by a concrete family or later - shared utility. + Static graph FLOP proxy for a batch-size-1 forward pass. Raises ------ - NotImplementedError - Raised by the Phase 1 default implementation because FLOP counting - has not yet been generalized into the abstraction layer. 
+ ValueError + If ``ctx.input_shape`` is missing or empty. """ - del model, ctx, config - raise NotImplementedError("FLOP counting is not implemented by the Phase 1 default.") + del config + if ctx.input_shape is None or len(ctx.input_shape) == 0: + raise ValueError("ModelFamilyABC requires a non-empty input shape to count FLOPs.") + return count_flops_keras( + model, + tuple(int(dim) for dim in ctx.input_shape), + ) def supports_tflite(self) -> bool: """Return whether the family is intended to support TFLite export. diff --git a/src/tinyodom/microcontrollers/README.md b/src/tinyodom/microcontrollers/README.md index 3de95d0..b0b7cae 100644 --- a/src/tinyodom/microcontrollers/README.md +++ b/src/tinyodom/microcontrollers/README.md @@ -280,7 +280,7 @@ Use this path only for boards built and flashed through `arduino-cli`. 1. `src/tinyodom/microcontrollers/arduino_.py` 2. `src/tinyodom/microcontrollers/__init__.py` 3. `setup_arduino.sh` when a new Arduino core package is required -4. `src/config/nas_config.yaml` and/or board-specific configs +4. `src/config/` example configs and any board-specific config variants 5. Tests in `test/test_hardware.py` and `test/test_model.py` 6. 
`src/tinyodom/model.py` only when you are changing shared request plumbing, which should be unusual for a normal Arduino board bring-up diff --git a/src/tinyodom/microcontrollers/stm32_nucleo_n657x0.py b/src/tinyodom/microcontrollers/stm32_nucleo_n657x0.py index 90b1ef2..103713e 100644 --- a/src/tinyodom/microcontrollers/stm32_nucleo_n657x0.py +++ b/src/tinyodom/microcontrollers/stm32_nucleo_n657x0.py @@ -3109,10 +3109,18 @@ def _upload_failure(kind: str, detail: str) -> DeviceMetrics: if passthrough_value is not None: merged_power_metrics[passthrough_field] = passthrough_value merged_power_metrics.update(runtime_storage_metrics) + result_latency_s = telemetry.latency_s + try: + harness_latency_s = float(merged_power_metrics.get("harness_latency_s", -1.0)) + except (TypeError, ValueError): + harness_latency_s = -1.0 + phase_label = str(merged_power_metrics.get("phase", "")).strip().lower() + if phase_label == "back_to_back" and harness_latency_s > 0.0: + result_latency_s = harness_latency_s return DeviceMetrics( ram_bytes=compile_result.ram_bytes or -1, flash_bytes=compile_result.flash_bytes or -1, - latency_s=telemetry.latency_s, + latency_s=result_latency_s, arena_bytes=compile_result.arena_bytes or -1, error_code=HIL_ERROR_OK, power_metrics=merged_power_metrics, @@ -3445,10 +3453,15 @@ def evaluate( ) merged_power_metrics["runtime_mode"] = "cadenced" merged_power_metrics.update(self._cadenced_power_metrics_from_phase_result(cadenced_result)) + result_latency_s = ( + cadenced_result.latency_s + if cadenced_result.latency_s >= 0.0 + else base_result.latency_s + ) return DeviceMetrics( ram_bytes=base_result.ram_bytes, flash_bytes=base_result.flash_bytes, - latency_s=base_result.latency_s, + latency_s=result_latency_s, arena_bytes=base_result.arena_bytes, error_code=base_result.error_code, power_metrics=merged_power_metrics, diff --git a/src/tinyodom/model.py b/src/tinyodom/model.py index 640efd9..e69341b 100644 --- a/src/tinyodom/model.py +++ 
b/src/tinyodom/model.py @@ -27,10 +27,8 @@ from tcn import TCN from tensorflow.keras import Input, Model from tensorflow.keras.layers import Dense, Flatten, MaxPooling1D, Reshape -from tensorflow.python.framework.convert_to_constants import ( - convert_variables_to_constants_v2, -) +from .model_metrics import count_flops_keras from .hardware import ( HIL_controller, describe_error_code, @@ -40,7 +38,7 @@ get_device as get_microcontroller_device, resolve_device_options, ) -DEFAULT_CONFIG_PATH = Path(__file__).resolve().parents[1] / "config" / "nas_config.yaml" +DEFAULT_CONFIG_PATH = Path(__file__).resolve().parents[1] / "config" / "nas_config_stm32.yaml" REPO_ROOT = Path(__file__).resolve().parents[2] STM32_DEFAULT_PROJECT_ROOT = ( REPO_ROOT @@ -65,12 +63,13 @@ BOARD_QUANTIZATION_CAPABILITIES = { "STM32_NUCLEO_N657X0_Q": {"float", "int8_ptq"}, "PORTENTA_H7": {"float", "int8_ptq"}, - "ARDUINO_NANO_33_BLE_SENSE": {"int8_ptq"}, + "ARDUINO_NANO_33_BLE_SENSE": {"float", "int8_ptq"}, } VALID_DERIVED_METRIC_TYPES = {"add", "energy-budget-from-power"} VALID_TERM_TYPES = {"weighted", "normalized-weighted", "boundary", "target"} VALID_OBJECTIVE_DIRECTIONS = {"maximize", "minimize"} VALID_PRUNE_CONDITIONS = {"gt", "gte", "lt", "lte"} +FEASIBILITY_EQUALITY_EPSILON = 1e-12 NONNEGATIVE_METRICS = { "ram_bytes", "flash_bytes", @@ -78,6 +77,11 @@ "max_flash_bytes", "external_flash_bytes", "flops", + "weight_bytes", + "activation_bytes", + "memory_traffic_bytes", + "memory_proxy_dtype_bytes", + "memory_proxy_warning_count", "latency_ms", "energy_mj_per_inference", "avg_power_mw", @@ -102,6 +106,11 @@ "max_flash_bytes", "external_flash_bytes", "flops", + "weight_bytes", + "activation_bytes", + "memory_traffic_bytes", + "memory_proxy_dtype_bytes", + "memory_proxy_warning_count", "latency_ms", "energy_mj_per_inference", "avg_power_mw", @@ -179,6 +188,11 @@ "external_flash_bytes", "weight_storage_mode", "flops", + "weight_bytes", + "activation_bytes", + "memory_traffic_bytes", + 
"memory_proxy_dtype_bytes", + "memory_proxy_warning_count", "latency_ms", "latency_budget_ms", "energy_mj_per_inference", @@ -197,6 +211,15 @@ "pruned", "prune_reason", "prune_rule", + "feasible", + "feasibility_status", + "feasibility_rule", + "feasibility_reason", + "feasibility_metric", + "feasibility_value", + "feasibility_reference", + "feasibility_violation", + "feasibility_constraints_json", *CADENCED_CSV_FIELDS, ) @@ -310,6 +333,29 @@ class TrialOutcome: quantization_mode: str = "int8_ptq" +@dataclass(frozen=True) +class FeasibilityEvaluation: + """Resolved feasibility policy outcome for one pre-training trial. + + Parameters + ---------- + feasible : bool + Whether every configured feasibility constraint is satisfied. + status : str + Stable status label: ``"feasible"``, ``"infeasible"``, or + ``"not_evaluated"``. + constraints : list[float] + Signed Optuna constraint values in configuration order. + first_violation : dict[str, Any] | None + Metadata for the first violating rule in configuration order. + """ + + feasible: bool + status: str + constraints: list[float] + first_violation: dict[str, Any] | None = None + + class ScoreConfigEvaluationError(ValueError): """Raised when a configured score cannot be evaluated at runtime. @@ -1664,7 +1710,8 @@ def _validate_prune_config( Raises ------ ValueError - If any prune rule is invalid or incompatible with the score mode. + If any prune rule is invalid or references metrics that are unavailable + before task training. 
""" prune_config = Dict(prune_input or {}) raw_rules = prune_config.get("rules", []) @@ -1672,9 +1719,6 @@ def _validate_prune_config( raw_rules = [] if not isinstance(raw_rules, list): raise ValueError("nas.prune.rules must be a list when provided.") - if is_multiobjective_score_config(score_config) and len(raw_rules) > 0: - raise ValueError("nas.prune.rules is only supported when nas.score.type is scoring-function.") - allowed_metric_names = allowed_base_metric_names | set(getattr(score_config, "metrics", Dict()).keys()) normalized_rules = [] for idx, raw_rule in enumerate(raw_rules): @@ -1720,6 +1764,142 @@ def _validate_prune_config( return prune_config +def _validate_feasibility_config( + feasibility_input: Any, + score_config: Dict, + allowed_base_metric_names: set[str], + training_only_metric_names: set[str], + *, + allow_unknown_metric_names: bool = False, +) -> Dict: + """Validate and normalize NAS feasibility constraints. + + Parameters + ---------- + feasibility_input : object + Raw ``nas.feasibility`` configuration. + score_config : addict.Dict + Normalized ``nas.score`` configuration. + allowed_base_metric_names : set[str] + Infrastructure and task-owned metric names that are valid for the + current validation context. + training_only_metric_names : set[str] + Task-owned metric names that are only available after training. + allow_unknown_metric_names : bool, optional + Whether generic config loading should defer unknown task metric checks. + + Returns + ------- + addict.Dict + Normalized feasibility configuration with ``train_if_infeasible`` and + ``rules`` fields. + + Raises + ------ + ValueError + If any feasibility rule is invalid or references metrics unavailable + before task training. 
+ """ + + feasibility_config = Dict(feasibility_input or {}) + raw_train_if_infeasible = feasibility_config.get("train_if_infeasible", False) + if not isinstance(raw_train_if_infeasible, bool): + raise ValueError("nas.feasibility.train_if_infeasible must be a boolean.") + feasibility_config.train_if_infeasible = bool(raw_train_if_infeasible) + feasibility_config.rules = _validate_prefit_rule_config( + raw_rules=feasibility_config.get("rules", []), + rule_path="nas.feasibility.rules", + score_config=score_config, + allowed_base_metric_names=allowed_base_metric_names, + training_only_metric_names=training_only_metric_names, + allow_unknown_metric_names=allow_unknown_metric_names, + ) + return feasibility_config + + +def _validate_prefit_rule_config( + *, + raw_rules: Any, + rule_path: str, + score_config: Dict, + allowed_base_metric_names: set[str], + training_only_metric_names: set[str], + allow_unknown_metric_names: bool = False, +) -> list[Dict]: + """Validate and normalize a list of pre-training NAS rules. + + Parameters + ---------- + raw_rules : Any + Raw rule list from ``nas.prune.rules`` or ``nas.feasibility.rules``. + rule_path : str + Human-readable config path used in error messages. + score_config : addict.Dict + Normalized ``nas.score`` configuration. + allowed_base_metric_names : set[str] + Infrastructure and task-owned metric names allowed by the active task. + training_only_metric_names : set[str] + Task metric names unavailable before training. + allow_unknown_metric_names : bool, optional + Whether generic config loading should defer unknown task metric checks. + + Returns + ------- + list[addict.Dict] + Normalized rule entries preserving configuration order. + + Raises + ------ + ValueError + If rules are malformed or reference metrics unavailable pre-training. 
+ """ + + if raw_rules is None: + raw_rules = [] + if not isinstance(raw_rules, list): + raise ValueError(f"{rule_path} must be a list when provided.") + allowed_metric_names = allowed_base_metric_names | set(getattr(score_config, "metrics", Dict()).keys()) + normalized_rules = [] + for idx, raw_rule in enumerate(raw_rules): + rule_cfg = Dict(raw_rule) + metric_name = str(rule_cfg.get("metric", "")).strip() + condition = str(rule_cfg.get("condition", "")).strip().lower() + if metric_name not in allowed_metric_names and not allow_unknown_metric_names: + raise ValueError(f"{rule_path}[{idx}] references unknown metric '{metric_name}'.") + if condition not in VALID_PRUNE_CONDITIONS: + raise ValueError( + f"{rule_path}[{idx}].condition must be one of: {sorted(VALID_PRUNE_CONDITIONS)}." + ) + if _metric_depends_on_training( + metric_name, + getattr(score_config, "metrics", Dict()), + training_only_metric_names, + ): + raise ValueError(f"{rule_path}[{idx}] may not use training-only metric '{metric_name}'.") + reference = _validate_typed_reference( + rule_cfg.get("reference"), + allowed_metric_names, + f"{rule_path}[{idx}]", + allow_unknown_metric_names=allow_unknown_metric_names, + ) + if reference.type == "metric" and _metric_depends_on_training( + str(reference.metric), + getattr(score_config, "metrics", Dict()), + training_only_metric_names, + ): + raise ValueError( + f"{rule_path}[{idx}] may not reference training-only metric '{reference.metric}'." 
+ ) + rule_cfg.metric = metric_name + rule_cfg.condition = condition + rule_cfg.reference = reference + rule_cfg.reason = str(rule_cfg.get("reason", "")).strip() + raw_rule_id = str(rule_cfg.get("rule", "")).strip() + rule_cfg.rule = raw_rule_id if raw_rule_id else f"rule_{idx}" + normalized_rules.append(rule_cfg) + return normalized_rules + + def _validate_nas_config( config: Dict, task_metric_names: set[str] | None = None, @@ -1771,8 +1951,16 @@ def _validate_nas_config( effective_training_only_metric_names, allow_unknown_metric_names=allow_unknown_metric_names, ) + feasibility_config = _validate_feasibility_config( + nas_config.get("feasibility", {}), + score_config, + allowed_base_metric_names, + effective_training_only_metric_names, + allow_unknown_metric_names=allow_unknown_metric_names, + ) nas_config.score = score_config nas_config.prune = prune_config + nas_config.feasibility = feasibility_config return nas_config @@ -1813,8 +2001,9 @@ def evaluate_prune_rules( hyperparams: Dict, score_config: Dict, prune_config: Dict, + task_nonnegative_metric_names: set[str] | None = None, ) -> tuple[str, str] | None: - """Evaluate pre-training prune rules against the current trial context. + """Evaluate post-build prune rules against the current trial context. Parameters ---------- @@ -1826,6 +2015,9 @@ def evaluate_prune_rules( Normalized score configuration that owns the derived metric registry. prune_config : addict.Dict Normalized prune configuration. + task_nonnegative_metric_names : set[str] | None, optional + Task-declared metric names that must not use negative unavailable + sentinels. Returns ------- @@ -1835,20 +2027,41 @@ def evaluate_prune_rules( Notes ----- - Rules are evaluated in configuration order. If a configured metric or - reference is unavailable at prune time, that unavailability is itself - treated as a prune outcome for the current rule.
+ Rules are evaluated in configuration order after model build/compile, FLOP + counting, and HIL/compile metrics are available, but before task fitting or + validation evaluation. If a configured metric or reference is unavailable + at gate time, that unavailability is itself treated as a prune outcome for + the current rule. """ context = dict(metrics) context["flops"] = hyperparams["flops"] for rule_cfg in getattr(prune_config, "rules", []): try: - metric_value = _resolve_metric_value(rule_cfg.metric, context, score_config) - reference_value = _typed_reference_value(rule_cfg.reference, context, score_config) + metric_value = _resolve_metric_value( + rule_cfg.metric, + context, + score_config, + task_nonnegative_metric_names, + ) except ValueError: return rule_cfg.rule, f"Configured prune metric unavailable: {rule_cfg.metric}" + try: + reference_value = _typed_reference_value( + rule_cfg.reference, + context, + score_config, + task_nonnegative_metric_names, + ) + except ValueError: + if rule_cfg.reference.type == "metric": + return ( + rule_cfg.rule, + f"Configured prune reference metric unavailable: {rule_cfg.reference.metric}", + ) + return rule_cfg.rule, "Configured prune reference unavailable" + condition_matched = { "gt": metric_value > reference_value, "gte": metric_value >= reference_value, @@ -1864,6 +2077,147 @@ def evaluate_prune_rules( return None +def _signed_feasibility_constraint( + *, + condition: str, + metric_value: float, + reference_value: float, +) -> float: + """Return an Optuna-compatible signed constraint value. + + Parameters + ---------- + condition : str + Feasibility condition from the rule config. + metric_value : float + Resolved metric value for the current trial. + reference_value : float + Resolved numeric reference value for the current trial. + + Returns + ------- + float + Signed constraint where values ``<= 0`` are feasible and values ``> 0`` + are infeasible. 
+ + Notes + ----- + Inclusive ``gte``/``lte`` rules treat equality as a violation because the + rule text says the forbidden region includes equality. A tiny positive + value preserves that distinction without changing ordinary violation + magnitudes. + """ + + if condition in {"gt", "gte"}: + signed = metric_value - reference_value + elif condition in {"lt", "lte"}: + signed = reference_value - metric_value + else: + raise ValueError(f"Unsupported feasibility condition '{condition}'.") + if condition in {"gte", "lte"} and signed == 0.0: + return FEASIBILITY_EQUALITY_EPSILON + return float(signed) + + +def evaluate_feasibility_rules( + metrics: dict[str, Any], + hyperparams: Dict, + score_config: Dict, + feasibility_config: Dict, + task_nonnegative_metric_names: set[str] | None = None, +) -> FeasibilityEvaluation: + """Evaluate NAS feasibility rules against the current pre-training context. + + Parameters + ---------- + metrics : dict[str, Any] + Runtime and compile metrics available before task training. + hyperparams : addict.Dict + Trial hyperparameters, including ``flops``. + score_config : addict.Dict + Normalized score configuration that owns derived metric definitions. + feasibility_config : addict.Dict + Normalized ``nas.feasibility`` policy. + task_nonnegative_metric_names : set[str] | None, optional + Task-declared metric names that must not use negative unavailable + sentinels. + + Returns + ------- + FeasibilityEvaluation + Signed constraints plus first-violation metadata. + + Notes + ----- + Feasibility is intentionally separate from pruning: every rule contributes + a signed Optuna constraint value, and the first violating rule in + configuration order supplies the CSV/user-attribute summary.
+ """ + + context = dict(metrics) + context["flops"] = hyperparams["flops"] + constraints: list[float] = [] + first_violation: dict[str, Any] | None = None + + for rule_cfg in getattr(feasibility_config, "rules", []): + try: + metric_value = _resolve_metric_value( + rule_cfg.metric, + context, + score_config, + task_nonnegative_metric_names, + ) + except ValueError: + metric_value = 1e12 + reference_value = 0.0 + violation = 1e12 + else: + try: + reference_value = _typed_reference_value( + rule_cfg.reference, + context, + score_config, + task_nonnegative_metric_names, + ) + except ValueError: + reference_value = 0.0 + violation = 1e12 + else: + violation = _signed_feasibility_constraint( + condition=str(rule_cfg.condition), + metric_value=float(metric_value), + reference_value=float(reference_value), + ) + + constraints.append(float(violation)) + if violation > 0.0 and first_violation is None: + reason = ( + rule_cfg.reason + or f"Feasibility rule '{rule_cfg.rule}' matched: {rule_cfg.metric} {rule_cfg.condition} {reference_value}" + ) + first_violation = { + "rule": str(rule_cfg.rule), + "reason": reason, + "metric": str(rule_cfg.metric), + "value": float(metric_value), + "reference": float(reference_value), + "violation": float(violation), + } + + if first_violation is None: + return FeasibilityEvaluation( + feasible=True, + status="feasible", + constraints=constraints, + ) + return FeasibilityEvaluation( + feasible=False, + status="infeasible", + constraints=constraints, + first_violation=first_violation, + ) + + def load_config( config_path: str | Path | None = None, *, @@ -1878,10 +2232,11 @@ def load_config( ---------- config_path : str | Path | None Optional override for the YAML location. Defaults to - ``src/config/nas_config.yaml``. + ``src/config/nas_config_stm32.yaml``. task_metric_names : set[str] | None, optional Task-owned metric names that may appear in score or prune configs. When - omitted, only generic NAS-policy validation runs during config load. 
+ omitted, generic NAS-policy validation may preserve metric names that + are unknown until a task contract is available. training_only_task_metric_names : set[str] | None, optional Task-owned metric names that are only available after training. When omitted, task-aware validation treats no task metrics as training-only. @@ -1906,10 +2261,11 @@ def load_config( ----- Besides parsing YAML, this helper normalizes derived artifact paths, validates board/runtime policy, and injects harness defaults. It always - performs the generic NAS score/prune validation pass. When - ``task_metric_names`` are supplied, it additionally validates the NAS - policy against that concrete task contract instead of deferring task-owned - metric checks to the later bootstrap step. + performs the generic NAS score/prune validation pass, preserving potential + task-owned metric names when no task context exists. When + ``task_metric_names`` are supplied, it rejects truly unknown metrics and + validates the NAS policy against that concrete task contract instead of + deferring task-owned metric checks to the later bootstrap step. """ cfg_path = Path(config_path) if config_path else DEFAULT_CONFIG_PATH if not cfg_path.exists(): @@ -2233,32 +2589,14 @@ def count_flops(model, input_shape): Returns ------- int - Total floating point operations for a single forward pass with batch size 1. + Static graph FLOP proxy for a single forward pass with batch size 1. Notes ----- - The estimate freezes the TensorFlow graph with a batch size of 1 and then - delegates to the TensorFlow v1 profiler. It is useful for relative NAS - comparisons, but it still depends on TensorFlow profiler support for the - active ops. + Compatibility wrapper around :func:`tinyodom.model_metrics.count_flops_keras`. 
""" - concrete = tf.function(model).get_concrete_function( - tf.TensorSpec([1, *input_shape], tf.float32) - ) - frozen = convert_variables_to_constants_v2(concrete) - graph_def = frozen.graph.as_graph_def() - - with tf.Graph().as_default() as graph: - tf.compat.v1.import_graph_def(graph_def, name="") - options = ( - tf.compat.v1.profiler.ProfileOptionBuilder( - tf.compat.v1.profiler.ProfileOptionBuilder.float_operation() - ) - .with_empty_output() - .build() - ) - flops = tf.compat.v1.profiler.profile(graph, options=options) - return flops.total_float_ops + + return count_flops_keras(model, tuple(int(dim) for dim in input_shape)) def require_logical_input_shape(input_shape: Any) -> tuple[int, int]: @@ -2372,6 +2710,20 @@ def _trial_log_row_mapping( "external_flash_bytes": metrics.get("external_flash_bytes", -1), "weight_storage_mode": metrics.get("weight_storage_mode", "embedded"), "flops": trial_outcome.hyperparams["flops"], + "weight_bytes": metrics.get("weight_bytes", trial_outcome.hyperparams.get("weight_bytes", -1)), + "activation_bytes": metrics.get("activation_bytes", trial_outcome.hyperparams.get("activation_bytes", -1)), + "memory_traffic_bytes": metrics.get( + "memory_traffic_bytes", + trial_outcome.hyperparams.get("memory_traffic_bytes", -1), + ), + "memory_proxy_dtype_bytes": metrics.get( + "memory_proxy_dtype_bytes", + trial_outcome.hyperparams.get("memory_proxy_dtype_bytes", -1), + ), + "memory_proxy_warning_count": metrics.get( + "memory_proxy_warning_count", + trial_outcome.hyperparams.get("memory_proxy_warning_count", -1), + ), "latency_ms": metrics["latency_ms"], "latency_budget_ms": metrics.get("latency_budget_ms", -1.0), "energy_mj_per_inference": metrics["energy_mj_per_inference"], @@ -2390,13 +2742,32 @@ def _trial_log_row_mapping( "pruned": pruned, "prune_reason": prune_reason, "prune_rule": prune_rule, + "feasible": metrics.get("feasible", not pruned), + "feasibility_status": metrics.get( + "feasibility_status", + "not_evaluated" if pruned 
else "feasible", + ), + "feasibility_rule": metrics.get("feasibility_rule", ""), + "feasibility_reason": metrics.get("feasibility_reason", ""), + "feasibility_metric": metrics.get("feasibility_metric", ""), + "feasibility_value": metrics.get("feasibility_value", ""), + "feasibility_reference": metrics.get("feasibility_reference", ""), + "feasibility_violation": metrics.get("feasibility_violation", ""), + "feasibility_constraints_json": metrics.get("feasibility_constraints_json", ""), } for field_name in CADENCED_CSV_FIELDS: mapping[field_name] = metrics.get(field_name) for metric_name, value in trial_outcome.task_metrics.items(): mapping[f"metric__{metric_name}"] = value for hyperparam_name, value in trial_outcome.hyperparams.items(): - if hyperparam_name == "flops": + if hyperparam_name in { + "flops", + "weight_bytes", + "activation_bytes", + "memory_traffic_bytes", + "memory_proxy_dtype_bytes", + "memory_proxy_warning_count", + }: continue mapping[f"hparam__{hyperparam_name}"] = value return mapping @@ -2609,6 +2980,29 @@ def log_trial( trial.set_user_attr("hil_error_code", metrics["error_code"]) trial.set_user_attr("arena_bytes", metrics["arena_bytes"]) trial.set_user_attr("flops", trial_outcome.hyperparams["flops"]) + trial.set_user_attr("weight_bytes", metrics.get("weight_bytes", trial_outcome.hyperparams.get("weight_bytes", -1))) + trial.set_user_attr( + "activation_bytes", + metrics.get("activation_bytes", trial_outcome.hyperparams.get("activation_bytes", -1)), + ) + trial.set_user_attr( + "memory_traffic_bytes", + metrics.get("memory_traffic_bytes", trial_outcome.hyperparams.get("memory_traffic_bytes", -1)), + ) + trial.set_user_attr( + "memory_proxy_dtype_bytes", + metrics.get( + "memory_proxy_dtype_bytes", + trial_outcome.hyperparams.get("memory_proxy_dtype_bytes", -1), + ), + ) + trial.set_user_attr( + "memory_proxy_warning_count", + metrics.get( + "memory_proxy_warning_count", + trial_outcome.hyperparams.get("memory_proxy_warning_count", -1), + ), + ) 
trial.set_user_attr("error_code", metrics["error_code"]) trial.set_user_attr( "score_type", @@ -2622,13 +3016,34 @@ def log_trial( trial.set_user_attr("pruned", pruned) trial.set_user_attr("prune_reason", prune_reason) trial.set_user_attr("prune_rule", prune_rule) + trial.set_user_attr("feasible", metrics.get("feasible", not pruned)) + trial.set_user_attr( + "feasibility_status", + metrics.get("feasibility_status", "not_evaluated" if pruned else "feasible"), + ) + trial.set_user_attr("feasibility_rule", metrics.get("feasibility_rule", "")) + trial.set_user_attr("feasibility_reason", metrics.get("feasibility_reason", "")) + trial.set_user_attr("feasibility_metric", metrics.get("feasibility_metric", "")) + trial.set_user_attr("feasibility_value", metrics.get("feasibility_value", "")) + trial.set_user_attr("feasibility_reference", metrics.get("feasibility_reference", "")) + trial.set_user_attr("feasibility_violation", metrics.get("feasibility_violation", "")) + trial.set_user_attr("feasibility_constraints_json", metrics.get("feasibility_constraints_json", "")) + if "feasibility_constraints" in metrics: + trial.set_user_attr("feasibility_constraints", metrics["feasibility_constraints"]) trial.set_user_attr("task_metrics", dict(trial_outcome.task_metrics)) trial.set_user_attr("hyperparameters", dict(trial_outcome.hyperparams)) trial.set_user_attr("artifact_summary", trial_outcome.artifact_summary) for metric_name, value in trial_outcome.task_metrics.items(): trial.set_user_attr(f"metric__{metric_name}", value) for hyperparam_name, value in trial_outcome.hyperparams.items(): - if hyperparam_name == "flops": + if hyperparam_name in { + "flops", + "weight_bytes", + "activation_bytes", + "memory_traffic_bytes", + "memory_proxy_dtype_bytes", + "memory_proxy_warning_count", + }: continue trial.set_user_attr(f"hparam__{hyperparam_name}", value) for field_name in CADENCED_ALL_FIELDS: diff --git a/src/tinyodom/model_families/audio_dscnn.py 
b/src/tinyodom/model_families/audio_dscnn.py index f1693a5..71327c2 100644 --- a/src/tinyodom/model_families/audio_dscnn.py +++ b/src/tinyodom/model_families/audio_dscnn.py @@ -18,9 +18,8 @@ ReLU, Reshape, ) -from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 - from ..interfaces import ModelFamilyABC +from ..model_metrics import count_flops_keras from ..pipeline_types import ModelBuildContext, TargetSpec BASE_CHANNELS_CHOICES = (4, 8, 12, 16, 20, 24, 32) @@ -385,39 +384,6 @@ def _pointwise_filters(hparams: dict[str, Any], block_index: int) -> int: return min(hparams["max_channels"], max(1, filters)) -def _count_flops(model: tf.keras.Model, input_shape: tuple[int, int]) -> int: - """Estimate DS-CNN FLOPs from a frozen TensorFlow graph. - - Parameters - ---------- - model : tensorflow.keras.Model - Built Keras model to profile. - input_shape : tuple[int, int] - Logical ``(frames, mel_bins)`` input shape. - - Returns - ------- - int - Forward-pass FLOP count for batch size 1. - """ - - concrete = tf.function(model).get_concrete_function( - tf.TensorSpec([1, *input_shape], tf.float32) - ) - frozen = convert_variables_to_constants_v2(concrete) - graph_def = frozen.graph.as_graph_def() - with tf.Graph().as_default() as graph: - tf.compat.v1.import_graph_def(graph_def, name="") - options = ( - tf.compat.v1.profiler.ProfileOptionBuilder( - tf.compat.v1.profiler.ProfileOptionBuilder.float_operation() - ) - .with_empty_output() - .build() - ) - flops = tf.compat.v1.profiler.profile(graph, options=options) - return int(flops.total_float_ops) - class AudioDSCNNFamily(ModelFamilyABC): """Explicit depthwise-plus-pointwise CNN family for log-mel audio classification.""" @@ -732,7 +698,7 @@ def count_flops( del config input_shape = self._validate_input_shape(ctx.input_shape) - return _count_flops(model, input_shape) + return count_flops_keras(model, input_shape) @staticmethod def _validate_input_shape(input_shape: tuple[int, ...] 
| None) -> tuple[int, int]: diff --git a/src/tinyodom/model_families/odom_tcn.py b/src/tinyodom/model_families/odom_tcn.py index 98e853a..91cf88b 100644 --- a/src/tinyodom/model_families/odom_tcn.py +++ b/src/tinyodom/model_families/odom_tcn.py @@ -13,9 +13,16 @@ from tcn import TCN from tensorflow.keras import Model from tensorflow.keras.layers import Dense, Flatten, Input, MaxPooling1D, Reshape -from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 - from ..interfaces import ModelFamilyABC +from ..model_metrics import ( + StaticMemoryEstimate, + count_flops_keras, + dtype_bytes_for_quantization, + estimate_static_memory_keras, + layer_tensor_elements, + tensor_shape_elements, + unique_weight_bytes, +) from ..pipeline_types import ModelBuildContext logger = logging.getLogger(__name__) @@ -143,38 +150,202 @@ def apply_combined_perturbation( return bn_touched, bias_touched -def _count_flops(model: tf.keras.Model, input_shape: tuple[int, int]) -> int: - """Estimate Odom TCN FLOPs from a frozen TensorFlow graph. +def _conv_output_channels(layer: tf.keras.layers.Layer) -> int | None: + """Infer output channel count for a Conv1D-like TCN child layer. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Candidate TCN child layer. + + Returns + ------- + int | None + Output channel count when available from layer attributes or weights. + """ + + filters = getattr(layer, "filters", None) + if filters is not None: + return int(filters) + weights = getattr(layer, "weights", []) or [] + if weights: + shape = list(weights[0].shape) + if shape: + return int(shape[-1]) + return None + + +def _activation_bytes(elements: int | None, dtype_bytes: int) -> tuple[int, int]: + """Return activation bytes and warning increment for an element count. + + Parameters + ---------- + elements : int | None + Activation element count. + dtype_bytes : int + Deployment dtype width. 
+ + Returns + ------- + tuple[int, int] + Bytes and warning increment. + """ + + if elements is None: + return 0, 1 + return int(elements) * int(dtype_bytes), 0 + + +def _estimate_odom_tcn_static_memory( + model: tf.keras.Model, + *, + quantization_mode: str, +) -> StaticMemoryEstimate: + """Estimate static tensor traffic for OdomTCN including TCN internals. Parameters ---------- model : tensorflow.keras.Model - Built Keras model to profile. - input_shape : tuple[int, int] - Logical `(timesteps, input_dim)` input shape. + Built OdomTCN model. + quantization_mode : str + Deployment quantization mode used to choose scalar byte width. Returns ------- - int - Forward-pass FLOP count for batch size 1. + StaticMemoryEstimate + Static memory proxy estimate for batch size 1. """ - concrete = tf.function(model).get_concrete_function( - tf.TensorSpec([1, *input_shape], tf.float32) - ) - frozen = convert_variables_to_constants_v2(concrete) - graph_def = frozen.graph.as_graph_def() - with tf.Graph().as_default() as graph: - tf.compat.v1.import_graph_def(graph_def, name="") - options = ( - tf.compat.v1.profiler.ProfileOptionBuilder( - tf.compat.v1.profiler.ProfileOptionBuilder.float_operation() + dtype_bytes = dtype_bytes_for_quantization(quantization_mode) + seen_weights: set[int] = set() + weight_bytes = 0 + activation_bytes = 0 + warning_count = 0 + + def add_operation( + layer: tf.keras.layers.Layer | None, + *, + input_elements: int | None, + output_elements: int | None, + inferred_shape: bool = False, + ) -> None: + """Accumulate one static memory operation. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer | None + Layer whose unique weights should be counted. + input_elements : int | None + Input activation element count. + output_elements : int | None + Output activation element count. + inferred_shape : bool, optional + Whether the activation shape came from architecture-aware inference. 
+ """ + + nonlocal weight_bytes, activation_bytes, warning_count + input_bytes, input_warning = _activation_bytes(input_elements, dtype_bytes) + output_bytes, output_warning = _activation_bytes(output_elements, dtype_bytes) + activation_bytes += input_bytes + output_bytes + warning_count += input_warning + output_warning + if inferred_shape: + warning_count += 1 + if layer is not None: + weight_bytes += unique_weight_bytes( + layer, + dtype_bytes=dtype_bytes, + seen_weights=seen_weights, ) - .with_empty_output() - .build() + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.InputLayer): + continue + if type(layer).__name__ != "TCN": + add_operation( + layer, + input_elements=layer_tensor_elements(layer, "input"), + output_elements=layer_tensor_elements(layer, "output"), + ) + continue + + input_shape = getattr(getattr(layer, "input", None), "shape", None) + output_elements = layer_tensor_elements(layer, "output") + residual_blocks = list(getattr(layer, "residual_blocks", []) or []) + if input_shape is None or len(input_shape) < 3 or not residual_blocks: + generic = estimate_static_memory_keras(model, quantization_mode=quantization_mode) + return StaticMemoryEstimate( + weight_bytes=generic.weight_bytes, + activation_bytes=generic.activation_bytes, + memory_traffic_bytes=generic.memory_traffic_bytes, + dtype_bytes=generic.dtype_bytes, + warning_count=generic.warning_count + 1, + ) + + timesteps = int(input_shape[1]) + current_channels = int(input_shape[2]) + for block in residual_blocks: + residual_channels = current_channels + block_layers = list(getattr(block, "_layers", []) or getattr(block, "layers", []) or []) + if not block_layers: + warning_count += 1 + continue + for child in block_layers: + layer_type = type(child).__name__ + if layer_type == "Conv1D": + output_channels = _conv_output_channels(child) + if output_channels is None: + input_elements = None + output_child_elements = None + else: + input_channels = ( + residual_channels + if 
str(child.name).startswith("matching_") + else current_channels + ) + input_elements = timesteps * input_channels + output_child_elements = timesteps * output_channels + if not str(child.name).startswith("matching_"): + current_channels = output_channels + elif layer_type == "Lambda" and str(child.name).startswith("matching_"): + input_elements = timesteps * residual_channels + output_child_elements = timesteps * residual_channels + else: + input_elements = timesteps * current_channels + output_child_elements = timesteps * current_channels + add_operation( + child, + input_elements=input_elements, + output_elements=output_child_elements, + inferred_shape=True, + ) + current_channels = _conv_output_channels(block_layers[0]) or current_channels + + add_operation( + None, + input_elements=timesteps * current_channels, + output_elements=output_elements, + inferred_shape=True, ) - flops = tf.compat.v1.profiler.profile(graph, options=options) - return int(flops.total_float_ops) + + for weight in model.weights: + key = id(weight) + if key in seen_weights: + continue + seen_weights.add(key) + elements = tensor_shape_elements(weight) + if elements is None: + warning_count += 1 + continue + weight_bytes += elements * dtype_bytes + + return StaticMemoryEstimate( + weight_bytes=int(weight_bytes), + activation_bytes=int(activation_bytes), + memory_traffic_bytes=int(weight_bytes + activation_bytes), + dtype_bytes=int(dtype_bytes), + warning_count=int(warning_count), + ) + class OdomTCNFamily(ModelFamilyABC): @@ -387,6 +558,40 @@ def custom_objects(self) -> dict[str, Any]: return {"TCN": TCN} + def estimate_static_memory( + self, + model: tf.keras.Model, + ctx: ModelBuildContext, + config: Any, + *, + quantization_mode: str, + ) -> StaticMemoryEstimate: + """Estimate static memory traffic for Odom TCN models. + + Parameters + ---------- + model : tensorflow.keras.Model + Built Odom TCN model. + ctx : ModelBuildContext + Build-time context. 
Unused because the built model carries the + concrete TCN input shape. + config : Any + Model-family configuration subtree. Unused for this estimate. + quantization_mode : str + Deployment quantization mode used to choose scalar byte width. + + Returns + ------- + StaticMemoryEstimate + Static tensor memory traffic estimate for batch size 1. + """ + + del ctx, config + return _estimate_odom_tcn_static_memory( + model, + quantization_mode=quantization_mode, + ) + def count_flops( self, model: tf.keras.Model, @@ -418,7 +623,7 @@ def count_flops( del config if ctx.input_shape is None or len(ctx.input_shape) < 2: raise ValueError("OdomTCNFamily requires a 2D input shape: (timesteps, input_dim).") - return _count_flops(model, (int(ctx.input_shape[0]), int(ctx.input_shape[1]))) + return count_flops_keras(model, (int(ctx.input_shape[0]), int(ctx.input_shape[1]))) def materialize_export_model( self, diff --git a/src/tinyodom/model_metrics.py b/src/tinyodom/model_metrics.py new file mode 100644 index 0000000..c3f382d --- /dev/null +++ b/src/tinyodom/model_metrics.py @@ -0,0 +1,268 @@ +"""Static model resource metric estimators.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +import tensorflow as tf +from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 + + +@dataclass(frozen=True) +class StaticMemoryEstimate: + """Static tensor-memory proxy estimate for one built model. + + Parameters + ---------- + weight_bytes : int + Unique model weight bytes under the selected deployment dtype. + activation_bytes : int + Sum of per-layer input and output activation bytes for batch size 1. + memory_traffic_bytes : int + Sum of per-layer input activation, weight, and output activation bytes. + dtype_bytes : int + Deployment dtype width in bytes. + warning_count : int + Count of layer estimates that required an inferred or incomplete shape. 
+ """ + + weight_bytes: int + activation_bytes: int + memory_traffic_bytes: int + dtype_bytes: int + warning_count: int = 0 + + +def dtype_bytes_for_quantization(quantization_mode: str) -> int: + """Return deployment dtype width for a supported quantization mode. + + Parameters + ---------- + quantization_mode : str + Deployment quantization mode, such as ``"float"`` or ``"int8_ptq"``. + + Returns + ------- + int + Number of bytes per scalar value in the static proxy. + + Raises + ------ + ValueError + If the mode is not supported by the static proxy. + """ + + normalized = str(quantization_mode).strip().lower() + if normalized in {"float", "float32"}: + return 4 + if normalized in {"int8_ptq", "int8"}: + return 1 + raise ValueError( + f"Unsupported quantization mode for static memory estimate: {quantization_mode!r}." + ) + + +def tensor_shape_elements(shape_like: Any) -> int | None: + """Count elements for a tensor shape with batch size fixed to one. + + Parameters + ---------- + shape_like : Any + Tensor, TensorShape, tuple/list, variable, or nested output collection. + + Returns + ------- + int | None + Element count, or ``None`` when a non-batch dimension is unknown. 
+ """ + + if shape_like is None: + return None + if isinstance(shape_like, (list, tuple)) and shape_like and not all( + isinstance(dim, (int, type(None))) for dim in shape_like + ): + total = 0 + for item in shape_like: + item_elements = tensor_shape_elements(item) + if item_elements is None: + return None + total += item_elements + return int(total) + + shape = getattr(shape_like, "shape", shape_like) + try: + dims = list(shape.as_list()) + except AttributeError: + try: + dims = list(shape) + except TypeError: + return None + if not dims: + return 1 + if dims[0] is None: + dims[0] = 1 + elements = 1 + for dim in dims: + if dim is None: + return None + elements *= int(dim) + return int(elements) + + +def layer_tensor_elements(layer: tf.keras.layers.Layer, attr_name: str) -> int | None: + """Return element count for a layer input or output tensor. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Layer to inspect. + attr_name : str + Tensor attribute name, usually ``"input"`` or ``"output"``. + + Returns + ------- + int | None + Element count when Keras exposes a concrete symbolic shape. + """ + + try: + return tensor_shape_elements(getattr(layer, attr_name)) + except (AttributeError, RuntimeError, ValueError): + return None + + +def unique_weight_bytes( + layer: tf.keras.layers.Layer, + *, + dtype_bytes: int, + seen_weights: set[int], +) -> int: + """Count bytes for weights that have not already been counted. + + Parameters + ---------- + layer : tensorflow.keras.layers.Layer + Layer whose weights should be counted. + dtype_bytes : int + Deployment dtype width. + seen_weights : set[int] + Mutable set of Keras variable identities already counted. + + Returns + ------- + int + Unique weight bytes for this layer. 
+ """ + + total = 0 + for weight in getattr(layer, "weights", []) or []: + key = id(weight) + if key in seen_weights: + continue + seen_weights.add(key) + elements = tensor_shape_elements(weight) + if elements is not None: + total += elements * dtype_bytes + return int(total) + + +def estimate_static_memory_keras( + model: tf.keras.Model, + *, + quantization_mode: str, +) -> StaticMemoryEstimate: + """Estimate static memory traffic for a generic Keras model. + + Parameters + ---------- + model : tensorflow.keras.Model + Built model to inspect. + quantization_mode : str + Deployment quantization mode used to choose scalar byte width. + + Returns + ------- + StaticMemoryEstimate + Static tensor traffic proxy for batch size 1. + """ + + dtype_bytes = dtype_bytes_for_quantization(quantization_mode) + seen_weights: set[int] = set() + weight_bytes = 0 + activation_bytes = 0 + warning_count = 0 + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.InputLayer): + continue + input_elements = layer_tensor_elements(layer, "input") + output_elements = layer_tensor_elements(layer, "output") + if input_elements is None or output_elements is None: + warning_count += 1 + input_bytes = 0 if input_elements is None else input_elements * dtype_bytes + output_bytes = 0 if output_elements is None else output_elements * dtype_bytes + activation_bytes += input_bytes + output_bytes + weight_bytes += unique_weight_bytes( + layer, + dtype_bytes=dtype_bytes, + seen_weights=seen_weights, + ) + + for weight in model.weights: + key = id(weight) + if key in seen_weights: + continue + seen_weights.add(key) + elements = tensor_shape_elements(weight) + if elements is None: + warning_count += 1 + continue + weight_bytes += elements * dtype_bytes + + return StaticMemoryEstimate( + weight_bytes=int(weight_bytes), + activation_bytes=int(activation_bytes), + memory_traffic_bytes=int(weight_bytes + activation_bytes), + dtype_bytes=int(dtype_bytes), + warning_count=int(warning_count), + ) + 
+ +def count_flops_keras(model: tf.keras.Model, input_shape: tuple[int, ...]) -> int: + """Estimate Keras model FLOPs by profiling a frozen forward graph. + + Parameters + ---------- + model : tensorflow.keras.Model + Built Keras model to profile. + input_shape : tuple[int, ...] + Logical input shape excluding the batch dimension. + + Returns + ------- + int + TensorFlow profiler FLOP count for a single forward pass with batch size 1. + + Notes + ----- + This is a static graph proxy. It is useful for relative NAS comparisons, + but it is not predicted latency or measured energy. + """ + + concrete = tf.function(model).get_concrete_function( + tf.TensorSpec([1, *input_shape], tf.float32) + ) + frozen = convert_variables_to_constants_v2(concrete) + graph_def = frozen.graph.as_graph_def() + with tf.Graph().as_default() as graph: + tf.compat.v1.import_graph_def(graph_def, name="") + options = ( + tf.compat.v1.profiler.ProfileOptionBuilder( + tf.compat.v1.profiler.ProfileOptionBuilder.float_operation() + ) + .with_empty_output() + .build() + ) + flops = tf.compat.v1.profiler.profile(graph, options=options) + return int(flops.total_float_ops) diff --git a/test/conftest.py b/test/conftest.py index 83502b5..1333c54 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,9 +4,27 @@ from pathlib import Path +ANALYSIS_SCRIPT_TESTS = { + "test_analysis_scripts.py", + "test_audio_desktop_smoke.py", + "test_audio_portenta_hil_smoke.py", + "test_audio_stm32_hil_smoke.py", + "test_stedgeai_phase0_probe.py", + "test_stm32_build_wrapper.py", + "test_stm32_project_portability.py", + "test_stm32_runner_wrappers.py", + "test_stm32_template_ownership.py", + "test_urbansound8k_input_profile.py", +} + + def pytest_ignore_collect(collection_path: Path, config) -> bool: - """Keep integration tests opt-in for the default `pytest test/` run.""" - if os.environ.get("RUN_INTEGRATION_TESTS") == "1": - return False + """Keep non-default suites opt-in for the default `pytest test/` run.""" path 
= Path(str(collection_path)) - return "integration" in path.parts and path.name.startswith("test_") + if ( + os.environ.get("RUN_INTEGRATION_TESTS") != "1" + and "integration" in path.parts + and path.name.startswith("test_") + ): + return True + return os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1" and path.name in ANALYSIS_SCRIPT_TESTS diff --git a/test/test_analysis_scripts.py b/test/test_analysis_scripts.py index 057ce5c..802c5b4 100644 --- a/test/test_analysis_scripts.py +++ b/test/test_analysis_scripts.py @@ -4,6 +4,7 @@ import importlib.util import json +import os import subprocess import sys import tempfile @@ -13,6 +14,11 @@ from types import SimpleNamespace from unittest.mock import MagicMock, patch +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + def _load_module(module_name: str, relative_path: str): """Load an analysis script by repository-relative path for wrapper tests. 
@@ -210,7 +216,7 @@ def test_single_hil_uses_configured_export_variant(self) -> None: server.config.device.harness_stable_low_ms = 500 server.determine_metrics.return_value = {"latency_ms": 1.0} - argv = ["run_single_hil.py", "--config", "src/config/nas_config.yaml"] + argv = ["run_single_hil.py", "--config", "src/config/nas_config_stm32.yaml"] with patch.object(sys, "argv", argv), patch.object( single_hil, "HILServer", return_value=server ), patch.object(single_hil, "_build_hyperparams", return_value={"nb_filters": 2}), patch.object( @@ -526,7 +532,7 @@ def test_phase2_candidate_uses_configured_device_name_in_tflite_filename(self): ), ): tflite_path, metadata = stm32_phase2_candidate.export_perturbed_candidate_tflite( - Path("/tmp/nas_config.yaml"), + Path("/tmp/nas_config_stm32.yaml"), output_root, ) @@ -580,7 +586,7 @@ def test_phase2_candidate_float_export_omits_calibration_data(self): ), ): stm32_phase2_candidate.export_perturbed_candidate_tflite( - Path("/tmp/nas_config.yaml"), + Path("/tmp/nas_config_stm32.yaml"), Path(tmpdir), ) @@ -780,7 +786,7 @@ def close(self): "#endif /* TOY_AI_PHASE_CONFIG_H */\n", encoding="utf-8", ) - config_path = tmp_path / "nas_config.yaml" + config_path = tmp_path / "nas_config_stm32.yaml" config_path.write_text("training:\n nas_trials: 1\n max_total_trials: 2\n", encoding="utf-8") output_path = tmp_path / "metrics.json" stage_output_root = tmp_path / "stage" diff --git a/test/test_audio_desktop_smoke.py b/test/test_audio_desktop_smoke.py index 7f2fd6c..d737cf8 100644 --- a/test/test_audio_desktop_smoke.py +++ b/test/test_audio_desktop_smoke.py @@ -2,6 +2,7 @@ import argparse import json +import os import sys import tempfile import unittest @@ -10,8 +11,12 @@ from unittest.mock import MagicMock, patch import numpy as np +import pytest from addict import Dict +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + ROOT_DIR = 
Path(__file__).resolve().parents[1] SRC_DIR = ROOT_DIR / "src" SCRIPT_DIR = ROOT_DIR / "analysis_scripts" / "audio_desktop_smoke" diff --git a/test/test_audio_portenta_hil_smoke.py b/test/test_audio_portenta_hil_smoke.py index 136cfc3..0080c7b 100644 --- a/test/test_audio_portenta_hil_smoke.py +++ b/test/test_audio_portenta_hil_smoke.py @@ -4,6 +4,7 @@ import argparse import json +import os import sys import tempfile import unittest @@ -12,8 +13,12 @@ from unittest.mock import MagicMock, patch import numpy as np +import pytest from addict import Dict +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + ROOT_DIR = Path(__file__).resolve().parents[1] SRC_DIR = ROOT_DIR / "src" SCRIPT_DIR = ROOT_DIR / "analysis_scripts" / "audio_portenta_hil_smoke" diff --git a/test/test_audio_stm32_hil_smoke.py b/test/test_audio_stm32_hil_smoke.py index 2bc2055..cd899ea 100644 --- a/test/test_audio_stm32_hil_smoke.py +++ b/test/test_audio_stm32_hil_smoke.py @@ -4,6 +4,7 @@ import argparse import json +import os import sys import tempfile import unittest @@ -12,8 +13,12 @@ from unittest.mock import MagicMock, patch import numpy as np +import pytest from addict import Dict +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + ROOT_DIR = Path(__file__).resolve().parents[1] SRC_DIR = ROOT_DIR / "src" SCRIPT_DIR = ROOT_DIR / "analysis_scripts" / "audio_stm32_hil_smoke" diff --git a/test/test_hardware.py b/test/test_hardware.py index 9ba7008..b744946 100644 --- a/test/test_hardware.py +++ b/test/test_hardware.py @@ -63,12 +63,7 @@ TFLiteSubprocessError, ) from tinyodom import hil_protocol # noqa: E402 -from tinyodom.devices import ArduinoDevice # noqa: E402 -from analysis_scripts.compare_keras_tflite_accuracy import ( # noqa: E402 - _batched_tflite_predict, - _normalize_keras_prediction_outputs, - _require_odometry_targets, -) +from 
tinyodom.devices import ArduinoDevice, CandidatePrepareRequest # noqa: E402 from tinyodom.microcontrollers.arduino_base import ( # noqa: E402 ARDUINO_CLI_BIN, ARDUINO_CLI_CONFIG, @@ -460,67 +455,6 @@ def test_prediction_preserves_keras_multi_output_order(self) -> None: self.assertEqual(tflite_output.shape, keras_output.shape) np.testing.assert_allclose(tflite_output, keras_output, rtol=1e-5, atol=1e-5) - def test_batched_accuracy_diagnostic_predict_matches_direct_int8_order(self) -> None: - """Validate batched diagnostic prediction preserves int8 output order. - - Returns - ------- - None - The test passes when the diagnostic batch-inference helper matches - the direct host-side TFLite prediction helper for a quantized - multi-output model. - """ - - inputs = tf.keras.Input(shape=(4,), name="input") - shared = tf.keras.layers.Dense(3, activation="relu")(inputs) - output_0 = tf.keras.layers.Dense(1, activation="linear", name="first")(shared) - output_1 = tf.keras.layers.Dense(2, activation="linear", name="second")(shared) - model = tf.keras.Model(inputs=inputs, outputs=[output_0, output_1]) - - with tempfile.TemporaryDirectory() as tmpdir: - tflite_path = Path(tmpdir) / "multi_output_int8.tflite" - convert_to_tflite_model( - model, - self.train_x, - quantization_mode="int8_ptq", - output_name=tflite_path, - ) - - direct = predict_tflite_model(tflite_path, self.train_x[:5]) - batched = _batched_tflite_predict(tflite_path, self.train_x[:5], batch_size=3) - - self.assertIsInstance(direct, list) - self.assertEqual(len(batched), 2) - for direct_output, batched_output in zip(direct, batched): - self.assertEqual(batched_output.shape, direct_output.shape) - np.testing.assert_allclose(batched_output, direct_output, rtol=1e-6, atol=1e-6) - - def test_accuracy_diagnostic_rejects_non_odometry_targets(self) -> None: - """Validate the diagnostic fails clearly for non-odometry targets. 
- - Returns - ------- - None - The test passes when a non-mapping target payload raises a - diagnostic-specific ``ValueError``. - """ - - with self.assertRaisesRegex(ValueError, "odometry"): - _require_odometry_targets(np.asarray([0, 1])) - - def test_accuracy_diagnostic_rejects_single_output_predictions(self) -> None: - """Validate the diagnostic fails clearly for single-output models. - - Returns - ------- - None - The test passes when a single prediction array raises a - diagnostic-specific ``ValueError``. - """ - - with self.assertRaisesRegex(ValueError, "two Keras outputs"): - _normalize_keras_prediction_outputs(np.zeros((2, 1), dtype=np.float32)) - def test_subprocess_nonzero_return_raises(self) -> None: """Nonzero worker exits should raise a structured subprocess error.""" failed_process = _FakeCompletedProcess(returncode=2, stderr="worker failed") @@ -2093,6 +2027,33 @@ def test_harness_only_prepare_for_runtime_runtimeerror_maps_to_upload_error(self class DeviceTimeoutPassThroughTests(unittest.TestCase): + def test_arduino_prepare_candidate_float_does_not_require_calibration(self): + """Arduino float exports should stage without representative data.""" + with tempfile.TemporaryDirectory() as tmpdir: + artifact_root = Path(tmpdir) / "candidate" + request = CandidatePrepareRequest( + config=Mock(), + model=Mock(), + model_variant="approx_trained", + artifact_root=artifact_root, + tflite_model_path=Path(tmpdir) / "model.tflite", + calibration_split=None, + quantization_mode="float", + input_shape=(32, 6), + ) + device = ArduinoDevice("ARDUINO_NANO_33_BLE_SENSE") + + with patch("tinyodom.hardware.convert_to_tflite_model") as tflite_mock, patch( + "tinyodom.hardware.convert_to_cpp_model" + ) as cpp_mock, patch("tinyodom.devices._sync_arduino_sketch_variant_for_config") as sync_mock: + prepared_dir = device.prepare_candidate(request=request) + + self.assertEqual(prepared_dir, artifact_root) + self.assertIsNone(tflite_mock.call_args.kwargs["training_data"]) + 
self.assertEqual(tflite_mock.call_args.kwargs["quantization_mode"], "float") + cpp_mock.assert_called_once() + sync_mock.assert_called_once_with(request.config, artifact_root) + def test_arduino_device_measure_preserves_zero_timeouts(self): # Zero timeout overrides should survive measurement setup instead of being replaced by defaults. device = ArduinoDevice("ARDUINO_NANO_33_BLE_SENSE") diff --git a/test/test_model.py b/test/test_model.py index dcf992b..d70b879 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -32,6 +32,7 @@ collect_bn_layers, collect_non_bn_bias_layers, count_flops, + evaluate_feasibility_rules, evaluate_score_config, iter_layers, load_config, @@ -1683,38 +1684,45 @@ def test_load_settings_accepts_fixed_int8_quantization(self) -> None: self.assertEqual(settings.training.quantization.choices, ["int8_ptq"]) def test_load_settings_accepts_searchable_quantization_on_supported_board(self) -> None: - """Supported boards may opt into float/int8 PTQ quantization search.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmp_path = Path(tmpdir) - config_path = tmp_path / "config.yaml" - config_path.write_text( - "\n".join( - [ - "device:", - " name: PORTENTA_H7", - " portenta:", - " target_core: cm7", - " split: 75_25", - " security: none", - "training:", - " nas_trials: 5", - " quantization:", - " mode: int8_ptq", - " search: true", - " choices: [float, int8_ptq]", - *self._score_lines(include_quantization=False), - "outputs:", - f" models_dir: \"{tmp_path / 'models'}\"", - f" candidate_dir: \"{tmp_path / 'candidate'}\"", - " artifact_stem: \"TinyOdomEx_Test\"", - ] - ) - ) + """Arduino-backed supported boards may search float and int8 PTQ exports.""" + for device_lines in ( + [ + " name: PORTENTA_H7", + " portenta:", + " target_core: cm7", + " split: 75_25", + " security: none", + ], + [" name: ARDUINO_NANO_33_BLE_SENSE"], + ): + with self.subTest(device=device_lines[0]): + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = 
Path(tmpdir) + config_path = tmp_path / "config.yaml" + config_path.write_text( + "\n".join( + [ + "device:", + *device_lines, + "training:", + " nas_trials: 5", + " quantization:", + " mode: int8_ptq", + " search: true", + " choices: [float, int8_ptq]", + *self._score_lines(include_quantization=False), + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'candidate'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) - settings = load_config(config_path=config_path) + settings = load_config(config_path=config_path) - self.assertTrue(settings.training.quantization.search) - self.assertEqual(settings.training.quantization.choices, ["float", "int8_ptq"]) + self.assertTrue(settings.training.quantization.search) + self.assertEqual(settings.training.quantization.choices, ["float", "int8_ptq"]) def test_load_settings_rejects_invalid_quantization_configs(self) -> None: """Quantization must use the new mapping shape and supported choices.""" @@ -1734,12 +1742,6 @@ def test_load_settings_rejects_invalid_quantization_configs(self) -> None: " search: false", " choices: [int8_ptq]", ], - "unsupported_ble_float": [ - " quantization:", - " mode: float", - " search: false", - " choices: [float]", - ], } for label, quantization_lines in cases.items(): with self.subTest(label=label): @@ -1918,12 +1920,13 @@ def test_load_settings_rejects_path_like_artifact_stem(self) -> None: def test_shipped_configs_use_artifact_stem_and_export_variant(self) -> None: """All shipped configs should load with the Phase 5 artifact schema.""" cases = [ - ("nas_config.yaml", "odom_tcn", "approx_trained"), + ("nas_config_stm32.yaml", "odom_tcn", "approx_trained"), ("nas_config_ble.yaml", "odom_tcn", "approx_trained"), ("nas_config_portenta.yaml", "odom_tcn", "approx_trained"), ("nas_config_audio_stm32.yaml", "audio_dscnn", "untrained"), ("nas_config_audio_portenta.yaml", "audio_dscnn", "untrained"), ("nas_config_flops_rmse.yaml", "odom_tcn", 
"approx_trained"), + ("nas_config_memory_proxy.yaml", "odom_tcn", "approx_trained"), ] for filename, family, export_variant in cases: with self.subTest(filename=filename): @@ -1938,28 +1941,26 @@ def test_shipped_configs_use_artifact_stem_and_export_variant(self) -> None: self.assertEqual(settings.model.family, family) self.assertEqual(selection["model_config"]["params"].export_variant, export_variant) - def test_shipped_configs_use_production_budgets_and_audio_default_search(self) -> None: - """Checked-in configs should use production budgets and default audio search.""" - - expected_budget = { - "nas_epochs": 55, - "model_epochs": 990, - "nas_trials": 150, - "nas_multiobjective_population_size": 50, - "max_total_trials": 300, + def test_shipped_configs_use_production_training_budgets(self) -> None: + """Checked-in example configs should use production training budgets.""" + + expected_budgets = { + "nas_config_stm32.yaml": 250, + "nas_config_ble.yaml": 150, + "nas_config_portenta.yaml": 150, + "nas_config_audio_stm32.yaml": 200, + "nas_config_audio_portenta.yaml": 200, + "nas_config_flops_rmse.yaml": 150, + "nas_config_memory_proxy.yaml": 150, } - for filename in ( - "nas_config.yaml", - "nas_config_ble.yaml", - "nas_config_portenta.yaml", - "nas_config_audio_stm32.yaml", - "nas_config_audio_portenta.yaml", - "nas_config_flops_rmse.yaml", - ): + for filename, expected_trials in expected_budgets.items(): with self.subTest(filename=filename): settings = load_config(config_path=ROOT_DIR / "src/config" / filename) - for key, value in expected_budget.items(): - self.assertEqual(settings.training[key], value) + self.assertEqual(settings.training["nas_epochs"], 55) + self.assertEqual(settings.training["model_epochs"], 990) + self.assertEqual(settings.training["nas_trials"], expected_trials) + self.assertEqual(settings.training["nas_multiobjective_population_size"], 50) + self.assertEqual(settings.training["max_total_trials"], 300) if 
filename.startswith("nas_config_audio"): self.assertEqual(settings.model.search, {}) @@ -1969,6 +1970,7 @@ def test_config_readme_lists_audio_config_and_artifact_stem(self) -> None: self.assertIn("nas_config_audio_stm32.yaml", readme) self.assertIn("nas_config_flops_rmse.yaml", readme) + self.assertIn("nas_config_memory_proxy.yaml", readme) self.assertIn("artifact_stem", readme) self.assertIn("export_variant", readme) self.assertIn("compile_when_hil_disabled", readme) @@ -1988,8 +1990,8 @@ def test_audio_stm32_config_derives_audio_artifact_names(self) -> None: "TinyOdomEx_UrbanSound8K_STM32_NUCLEO_N657X0_Q.keras", ) self.assertEqual(settings.training.quantization.mode, "int8_ptq") - self.assertTrue(settings.training.quantization.search) - self.assertEqual(settings.training.quantization.choices, ["float", "int8_ptq"]) + self.assertFalse(settings.training.quantization.search) + self.assertEqual(settings.training.quantization.choices, ["int8_ptq"]) def test_audio_stm32_config_resolves_audio_components(self) -> None: """The audio STM32 config should resolve the audio component stack.""" @@ -2771,8 +2773,17 @@ def test_load_settings_accepts_empty_scalar_prune_rules(self) -> None: self.assertEqual(settings.nas.prune.rules, []) - def test_load_settings_rejects_prune_rules_for_multiobjective_score(self) -> None: - # Invalid prune rules for multiobjective score should fail during config load so unsupported NAS settings never reach execution. + def test_load_settings_accepts_prune_rules_for_multiobjective_score(self) -> None: + """Multi-objective score configs should accept pre-fit feasibility gates. + + Returns + ------- + None + Asserts valid multi-objective prune rules pass task-aware + validation. + """ + # Multi-objective prune rules are post-build gates, not Optuna pruning, + # so valid pre-fit metrics should pass task-aware validation. 
with tempfile.TemporaryDirectory() as tmpdir: tmp_path = Path(tmpdir) cfg = tmp_path / "config.yaml" @@ -2798,11 +2809,13 @@ def test_load_settings_rejects_prune_rules_for_multiobjective_score(self) -> Non " direction: minimize", " prune:", " rules:", - " - metric: latency_ms", + " - rule: latency_budget", + " metric: latency_ms", " condition: gt", " reference:", " type: metric", " metric: latency_budget_ms", + " reason: Latency exceeds deployment budget", "outputs:", f" models_dir: \"{tmp_path / 'models'}\"", f" candidate_dir: \"{tmp_path / 'tcn'}\"", @@ -2811,8 +2824,14 @@ def test_load_settings_rejects_prune_rules_for_multiobjective_score(self) -> Non ) ) - with self.assertRaisesRegex(ValueError, "only supported"): - load_config(config_path=cfg) + settings = load_config( + config_path=cfg, + task_metric_names={"rmse_total"}, + training_only_task_metric_names={"rmse_total"}, + ) + + self.assertEqual(settings.nas.prune.rules[0].rule, "latency_budget") + self.assertEqual(settings.nas.prune.rules[0].metric, "latency_ms") def test_load_settings_defers_prune_rules_that_depend_on_training_metrics_until_task_validation(self) -> None: """Task-dependent prune validation should run after the task contract is known.""" @@ -3329,6 +3348,62 @@ def test_load_settings_accepts_custom_task_metric_in_prune_rules(self) -> None: self.assertEqual(settings.nas.prune.rules[0].metric, "custom_metric") + def test_validate_nas_policy_for_task_rejects_unknown_multiobjective_prune_metric(self) -> None: + """Task-aware validation should reject undeclared multi-objective gate metrics. + + Returns + ------- + None + Asserts generic load preserves the metric and task-aware validation + rejects it. 
+ """ + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: multi-objective", + " params:", + " objectives:", + " - metric: flops", + " direction: minimize", + " prune:", + " rules:", + " - rule: custom_task_gate", + " metric: custom_metric", + " condition: gt", + " reference:", + " type: literal", + " value: 0.0", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + settings = load_config(config_path=cfg) + + self.assertEqual(settings.nas.prune.rules[0].metric, "custom_metric") + with self.assertRaisesRegex(ValueError, "unknown metric"): + validate_nas_policy_for_task( + settings, + task_metric_names={"different_metric"}, + training_only_task_metric_names=set(), + ) + def test_load_settings_rejects_prune_rules_that_use_custom_training_only_task_metrics(self) -> None: """Prune rules may not directly read task metrics that need training.""" # Custom training-only task metrics cannot appear in prune rules because those values are unavailable before fit() runs. @@ -3377,6 +3452,112 @@ def test_load_settings_rejects_prune_rules_that_use_custom_training_only_task_me training_only_task_metric_names={"custom_metric"}, ) + def test_load_settings_rejects_multiobjective_prune_rules_with_training_only_metrics(self) -> None: + """Multi-objective prune rules may not read post-training task metrics. + + Returns + ------- + None + Asserts direct training-only task metrics are invalid in + multi-objective gates. 
+ """ + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: multi-objective", + " params:", + " objectives:", + " - metric: flops", + " direction: minimize", + " prune:", + " rules:", + " - rule: custom_task_gate", + " metric: custom_metric", + " condition: gt", + " reference:", + " type: literal", + " value: 0.0", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + with self.assertRaisesRegex(ValueError, "training-only"): + load_config( + config_path=cfg, + task_metric_names={"custom_metric"}, + training_only_task_metric_names={"custom_metric"}, + ) + + def test_load_settings_rejects_multiobjective_prune_reference_with_training_only_metrics(self) -> None: + """Multi-objective prune references may not read post-training task metrics. + + Returns + ------- + None + Asserts reference metrics that need training are invalid in + multi-objective gates. 
+ """ + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: multi-objective", + " params:", + " objectives:", + " - metric: flops", + " direction: minimize", + " prune:", + " rules:", + " - rule: custom_task_gate", + " metric: flops", + " condition: gt", + " reference:", + " type: metric", + " metric: custom_metric", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + with self.assertRaisesRegex(ValueError, "training-only"): + load_config( + config_path=cfg, + task_metric_names={"custom_metric"}, + training_only_task_metric_names={"custom_metric"}, + ) + def test_load_settings_rejects_prune_rules_that_depend_on_custom_training_only_task_metrics(self) -> None: """Prune rules may not depend indirectly on task metrics that need training.""" # Derived prune metrics cannot close over training-only task signals because prune decisions happen before training. @@ -3431,6 +3612,65 @@ def test_load_settings_rejects_prune_rules_that_depend_on_custom_training_only_t training_only_task_metric_names={"custom_metric"}, ) + def test_load_settings_rejects_multiobjective_prune_rules_with_training_dependent_derived_metrics(self) -> None: + """Multi-objective prune rules may not indirectly depend on training metrics. + + Returns + ------- + None + Asserts derived metrics that depend on training-only metrics are + invalid in multi-objective gates. 
+ """ + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: multi-objective", + " metrics:", + " combined_metric:", + " type: add", + " metrics:", + " - custom_metric", + " - flops", + " params:", + " objectives:", + " - metric: flops", + " direction: minimize", + " prune:", + " rules:", + " - rule: custom_task_gate", + " metric: combined_metric", + " condition: gt", + " reference:", + " type: literal", + " value: 0.0", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + with self.assertRaisesRegex(ValueError, "training-only"): + load_config( + config_path=cfg, + task_metric_names={"custom_metric"}, + training_only_task_metric_names={"custom_metric"}, + ) + def test_load_settings_accepts_cadenced_sleep_metric_in_score_terms(self) -> None: # Cadenced sleep metric in score terms should remain a supported config shape. 
with tempfile.TemporaryDirectory() as tmpdir: @@ -3595,6 +3835,129 @@ def test_load_settings_accepts_cadenced_error_code_in_prune_rules(self) -> None: self.assertEqual(settings.nas.prune.rules[0].metric, "cadenced_error_code") + def test_load_settings_accepts_feasibility_rules(self) -> None: + """Valid pre-training feasibility rules should normalize from config.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: scoring-function", + " params:", + " terms:", + " - type: weighted", + " metric: flops", + " weight: -1.0", + " feasibility:", + " train_if_infeasible: true", + " rules:", + " - rule: latency_budget", + " metric: latency_ms", + " condition: gt", + " reference:", + " type: metric", + " metric: latency_budget_ms", + " reason: Latency exceeds budget", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + settings = load_config(config_path=cfg) + + self.assertTrue(settings.nas.feasibility.train_if_infeasible) + self.assertEqual(settings.nas.feasibility.rules[0].rule, "latency_budget") + self.assertEqual(settings.nas.feasibility.rules[0].reference.metric, "latency_budget_ms") + + def test_validate_nas_policy_rejects_training_only_feasibility_metric(self) -> None: + """Feasibility rules must not depend on post-training task metrics.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + cfg = tmp_path / "config.yaml" + cfg.write_text( + "\n".join( + [ + "device:", + " name: TEST_DEVICE", + "training:", + " nas_trials: 10", + " quantization:", + " mode: int8_ptq", + " search: false", + " choices: [int8_ptq]", + "nas:", + " score:", + " type: 
scoring-function", + " params:", + " terms:", + " - type: weighted", + " metric: flops", + " weight: -1.0", + " feasibility:", + " rules:", + " - rule: rmse_gate", + " metric: rmse_total", + " condition: gt", + " reference:", + " type: literal", + " value: 1.0", + "outputs:", + f" models_dir: \"{tmp_path / 'models'}\"", + f" candidate_dir: \"{tmp_path / 'tcn'}\"", + " artifact_stem: \"TinyOdomEx_Test\"", + ] + ) + ) + + settings = load_config(config_path=cfg) + with self.assertRaisesRegex(ValueError, "training-only metric 'rmse_total'"): + validate_nas_policy_for_task( + settings, + task_metric_names={"rmse_total"}, + training_only_task_metric_names={"rmse_total"}, + ) + + def test_evaluate_feasibility_rules_computes_signed_constraints(self) -> None: + """Feasibility rules should emit signed Optuna constraints in order.""" + score_config = Dict(type="scoring-function", metrics=Dict(), params=Dict()) + feasibility_config = Dict( + rules=[ + Dict(rule="gt", metric="latency_ms", condition="gt", reference=Dict(type="literal", value=10.0), reason=""), + Dict(rule="gte", metric="latency_ms", condition="gte", reference=Dict(type="literal", value=12.0), reason=""), + Dict(rule="lt", metric="latency_ms", condition="lt", reference=Dict(type="literal", value=20.0), reason=""), + Dict(rule="lte", metric="latency_ms", condition="lte", reference=Dict(type="literal", value=12.0), reason=""), + ] + ) + + result = evaluate_feasibility_rules( + metrics={"latency_ms": 12.0}, + hyperparams=Dict(flops=1), + score_config=score_config, + feasibility_config=feasibility_config, + ) + + self.assertFalse(result.feasible) + self.assertEqual(result.status, "infeasible") + self.assertGreater(result.constraints[0], 0.0) + self.assertGreater(result.constraints[1], 0.0) + self.assertGreater(result.constraints[2], 0.0) + self.assertGreater(result.constraints[3], 0.0) + self.assertEqual(result.first_violation["rule"], "gt") + def test_load_settings_missing_file(self) -> None: """Nonexistent config 
paths should raise FileNotFoundError.""" # Missing config files should fail immediately instead of producing a partially initialized runtime. @@ -3878,6 +4241,11 @@ def _sample_metrics(self): "flash_bytes": 2000, "external_flash_bytes": 3000, "weight_storage_mode": "external_flash", + "weight_bytes": 4096, + "activation_bytes": 8192, + "memory_traffic_bytes": 12288, + "memory_proxy_dtype_bytes": 1, + "memory_proxy_warning_count": 4, "rmse_total": 0.3, "latency_ms": 10, "latency_budget_ms": -1, @@ -3906,6 +4274,11 @@ def _sample_hyperparams(self): """Return a representative hyperparameter payload for trial-log tests.""" return { "flops": 1_000_000, + "weight_bytes": 4096, + "activation_bytes": 8192, + "memory_traffic_bytes": 12288, + "memory_proxy_dtype_bytes": 1, + "memory_proxy_warning_count": 4, "nb_filters": 32, "kernel_size": 3, "dilations": [1, 2, 4], @@ -3991,6 +4364,23 @@ def test_log_trial_writes_header_and_row(self): rows[1][header_index["weight_storage_mode"]], metrics["weight_storage_mode"], ) + self.assertEqual(int(rows[1][header_index["weight_bytes"]]), metrics["weight_bytes"]) + self.assertEqual( + int(rows[1][header_index["activation_bytes"]]), + metrics["activation_bytes"], + ) + self.assertEqual( + int(rows[1][header_index["memory_traffic_bytes"]]), + metrics["memory_traffic_bytes"], + ) + self.assertEqual( + int(rows[1][header_index["memory_proxy_dtype_bytes"]]), + metrics["memory_proxy_dtype_bytes"], + ) + self.assertEqual( + int(rows[1][header_index["memory_proxy_warning_count"]]), + metrics["memory_proxy_warning_count"], + ) self.assertEqual( float(rows[1][header_index["latency_ms"]]), metrics["latency_ms"] ) @@ -4061,6 +4451,21 @@ def test_log_trial_writes_header_and_row(self): fake_trial.attrs["weight_storage_mode"], metrics["weight_storage_mode"], ) + self.assertEqual(fake_trial.attrs["weight_bytes"], metrics["weight_bytes"]) + self.assertEqual(fake_trial.attrs["activation_bytes"], metrics["activation_bytes"]) + self.assertEqual( + 
fake_trial.attrs["memory_traffic_bytes"], + metrics["memory_traffic_bytes"], + ) + self.assertEqual( + fake_trial.attrs["memory_proxy_dtype_bytes"], + metrics["memory_proxy_dtype_bytes"], + ) + self.assertEqual( + fake_trial.attrs["memory_proxy_warning_count"], + metrics["memory_proxy_warning_count"], + ) + self.assertNotIn("hparam__memory_traffic_bytes", fake_trial.attrs) self.assertEqual(fake_trial.attrs["task_metrics"], trial_outcome.task_metrics) self.assertEqual(fake_trial.attrs["metric__rmse_vel_x"], 0.1) self.assertEqual(fake_trial.attrs["metric__rmse_vel_y"], 0.2) diff --git a/test/test_modularity_scaffolding.py b/test/test_modularity_scaffolding.py index a5454ed..19f038b 100644 --- a/test/test_modularity_scaffolding.py +++ b/test/test_modularity_scaffolding.py @@ -8,6 +8,8 @@ from pathlib import Path from unittest.mock import MagicMock, sentinel, patch +import tensorflow as tf + ROOT_DIR = Path(__file__).resolve().parents[1] SRC_DIR = ROOT_DIR / "src" if str(SRC_DIR) not in sys.path: @@ -336,10 +338,30 @@ def test_default_materialize_export_model_rejects_missing_trained_checkpoint(sel checkpoint_path=missing_checkpoint, ) - def test_default_count_flops_raises_not_implemented(self) -> None: - # The abstract default should raise NotImplementedError until a concrete model family supplies a FLOP counter. - with self.assertRaises(NotImplementedError): - self.model_family.count_flops(sentinel.model, self.ctx, {}) + def test_default_count_flops_uses_generic_keras_estimator(self) -> None: + # The default model-family FLOP hook should use the shared generic Keras profiler when an input shape is available. 
+ model = tf.keras.Sequential( + [ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(1), + ] + ) + + flops = self.model_family.count_flops(model, self.ctx, {}) + + self.assertIsInstance(flops, int) + self.assertGreater(flops, 0) + + def test_default_count_flops_rejects_missing_input_shape(self) -> None: + # Generic FLOP estimation needs a logical input shape so it can build the batch-size-1 signature. + ctx = ModelBuildContext( + input_shape=None, + input_dtype="float32", + target_spec=self.target_spec, + ) + + with self.assertRaisesRegex(ValueError, "input shape"): + self.model_family.count_flops(sentinel.model, ctx, {}) class PurityTests(unittest.TestCase): @@ -355,6 +377,7 @@ def test_new_modules_do_not_import_forbidden_runtime_modules(self) -> None: } module_paths = [ SRC_DIR / "tinyodom" / "interfaces.py", + SRC_DIR / "tinyodom" / "model_metrics.py", SRC_DIR / "tinyodom" / "pipeline_types.py", SRC_DIR / "tinyodom" / "registry.py", ] diff --git a/test/test_nas_model_client.py b/test/test_nas_model_client.py index 2b02ae5..135cd1b 100644 --- a/test/test_nas_model_client.py +++ b/test/test_nas_model_client.py @@ -37,6 +37,7 @@ TFLiteSubprocessError, ) # noqa: E402 from tinyodom.model import ScoreConfigEvaluationError, TrialOutcome # noqa: E402 +from tinyodom.model_metrics import StaticMemoryEstimate # noqa: E402 from tinyodom.pipeline_types import ( DataSplit, DatasetBundle, @@ -163,6 +164,15 @@ def _build_test_client(base_dir: Path | None = None) -> NASModelClient: build_model=MagicMock(return_value=fake_built_model), load_model=MagicMock(return_value=fake_loaded_model), count_flops=MagicMock(return_value=1234), + estimate_static_memory=MagicMock( + return_value=StaticMemoryEstimate( + weight_bytes=12, + activation_bytes=34, + memory_traffic_bytes=46, + dtype_bytes=4, + warning_count=0, + ) + ), supports_tflite=MagicMock(return_value=True), default_seed_trial=MagicMock( return_value={ @@ -635,6 +645,14 @@ def 
test_objective_happy_path_runs_training(self) -> None: self.client.model_build_context, self.client.model_config, ) + self.client.model_family.estimate_static_memory.assert_called_once_with( + self.client.model_family.build_model.return_value, + self.client.model_build_context, + self.client.model_config, + quantization_mode="float", + ) + logged_hparams = self.mock_log.call_args.kwargs["trial_outcome"].hyperparams + self.assertEqual(logged_hparams["memory_traffic_bytes"], 46) self.client.task.build_fit_plan.assert_called_once_with( self.client.dataset_bundle, self.client.task_config, @@ -797,6 +815,94 @@ def test_objective_prunes_when_rule_metric_is_unavailable(self) -> None: self.assertEqual(self.mock_log.call_args.kwargs["prune_rule"], "rule_0") self.assertIn("Configured prune metric unavailable", self.mock_log.call_args.kwargs["prune_reason"]) + def test_objective_infeasible_skips_training_and_logs_constraints(self) -> None: + """Feasibility violations should complete with penalties when training is disabled.""" + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 25.0, + "latency_budget_ms": 20.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + self.client._hil_request = MagicMock(return_value=metrics) + trial = DummyTrial() + + result = self.client.objective(trial) + + self.assertEqual(result, -100.0) + self.client.task.build_fit_plan.assert_not_called() + logged_metrics = self.mock_log.call_args.kwargs["metrics"] + self.assertFalse(logged_metrics["feasible"]) + self.assertEqual(logged_metrics["feasibility_status"], "infeasible") + 
self.assertEqual(logged_metrics["feasibility_rule"], "latency_budget") + self.assertEqual(logged_metrics["feasibility_constraints"], [5.0]) + self.assertEqual(json.loads(logged_metrics["feasibility_constraints_json"]), [5.0]) + self.assertFalse(self.mock_log.call_args.kwargs.get("pruned", False)) + + def test_objective_train_if_infeasible_trains_with_real_objectives(self) -> None: + """Infeasible trials may still train while remaining constrained.""" + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 25.0, + "latency_budget_ms": 20.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict( + objectives=[ + Dict(metric="latency_ms", direction="minimize"), + Dict(metric="rmse_total", direction="minimize"), + ] + ), + ) + self.client.config.nas.feasibility = Dict( + train_if_infeasible=True, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + self.client._hil_request = MagicMock(return_value=metrics) + + result = self.client.objective(DummyTrial()) + + self.assertEqual(result, (25.0, 0.3)) + self.client.task.build_fit_plan.assert_called_once() + logged_metrics = self.mock_log.call_args.kwargs["metrics"] + self.assertFalse(logged_metrics["feasible"]) + self.assertEqual(logged_metrics["feasibility_status"], "infeasible") + self.assertEqual(logged_metrics["feasibility_constraints"], [5.0]) + def test_objective_uses_negative_one_rmse_sentinels_for_failed_trials(self) -> None: # Failed trials should log stable RMSE sentinels so CSV summaries can distinguish a failure from missing training output. 
metrics = { @@ -898,6 +1004,8 @@ def test_objective_samples_cpu_clock_into_device_overrides(self) -> None: self.assertNotIn("model_variant", sent_payload) self.assertNotIn("cpu_clock_mhz", sent_payload["family_hparams"]) self.assertEqual(sent_payload["runtime_metadata"]["flops"], 1234) + self.assertEqual(sent_payload["runtime_metadata"]["memory_traffic_bytes"], 46) + self.assertEqual(sent_payload["runtime_metadata"]["memory_proxy_dtype_bytes"], 4) self.assertEqual(trial.params["cpu_clock_mhz_index"], 0) def test_objective_omits_device_overrides_when_clock_options_are_null(self) -> None: @@ -1017,6 +1125,264 @@ def test_objective_returns_configured_multiobjective_tuple(self) -> None: self.assertEqual(result, (10.0, 512.0)) self.mock_log.assert_called_once() + def test_objective_multiobjective_rule_hit_returns_penalty_tuple(self) -> None: + """Multi-objective prune-rule hits should log and return signed penalties. + + Returns + ------- + None + Asserts that post-HIL gates skip fit/evaluation and return + direction-aware penalties. 
+ """ + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 25.0, + "latency_budget_ms": 20.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict( + objectives=[ + Dict(metric="latency_ms", direction="minimize"), + Dict(metric="rmse_total", direction="maximize"), + ] + ), + ) + self.client.config.nas.prune = Dict( + rules=[ + Dict( + rule="deadline_miss_limit", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds deployment budget", + ) + ] + ) + self.client._hil_request = MagicMock(return_value=metrics) + trial = DummyTrial() + + result = self.client.objective(trial) + + self.assertEqual(result, (1e12, -1e12)) + self.client.model_family.build_model.assert_called_once() + self.client.task.compile_model.assert_called_once() + self.client.model_family.count_flops.assert_called_once() + self.client._hil_request.assert_called_once() + self.client.task.build_fit_plan.assert_not_called() + self.client.model_family.build_model.return_value.fit.assert_not_called() + self.client._evaluate_model_with_backend.assert_not_called() + self.mock_log.assert_called_once() + self.assertTrue(self.mock_log.call_args.kwargs["pruned"]) + self.assertEqual(self.mock_log.call_args.kwargs["prune_rule"], "deadline_miss_limit") + self.assertEqual( + self.mock_log.call_args.kwargs["prune_reason"], + "Latency exceeds deployment budget", + ) + + def test_objective_multiobjective_rule_reference_unavailable_reason(self) -> None: + """Unavailable prune reference metrics should be named in the reason. + + Returns + ------- + None + Asserts reference-metric unavailability is distinct from rule + metric unavailability. 
+ """ + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 25.0, + "latency_budget_ms": -1.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict(objectives=[Dict(metric="latency_ms", direction="minimize")]), + ) + self.client.config.nas.prune = Dict( + rules=[ + Dict( + rule="latency_reference", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds deployment budget", + ) + ] + ) + self.client._hil_request = MagicMock(return_value=metrics) + + result = self.client.objective(DummyTrial()) + + self.assertEqual(result, (1e12,)) + self.assertEqual(self.mock_log.call_args.kwargs["prune_rule"], "latency_reference") + self.assertEqual( + self.mock_log.call_args.kwargs["prune_reason"], + "Configured prune reference metric unavailable: latency_budget_ms", + ) + + def test_objective_multiobjective_custom_nonnegative_sentinel_fails_closed(self) -> None: + """Task-owned nonnegative prune metrics should treat -1 as unavailable. + + Returns + ------- + None + Asserts task nonnegative sentinel handling is applied before fit. 
+ """ + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 10.0, + "custom_feasibility": -1.0, + } + self.client.metric_contract = TaskMetricContract( + available_metric_names={"custom_feasibility", "rmse_total"}, + training_only_metric_names={"rmse_total"}, + nonnegative_metric_names={"custom_feasibility", "rmse_total"}, + primary_metric_names={"rmse_total"}, + ) + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict(objectives=[Dict(metric="latency_ms", direction="minimize")]), + ) + self.client.config.nas.prune = Dict( + rules=[ + Dict( + rule="custom_prefit_gate", + metric="custom_feasibility", + condition="gt", + reference=Dict(type="literal", value=0.0), + reason="Custom feasibility failed", + ) + ] + ) + self.client._hil_request = MagicMock(return_value=metrics) + + result = self.client.objective(DummyTrial()) + + self.assertEqual(result, (1e12,)) + self.client.task.build_fit_plan.assert_not_called() + self.assertEqual(self.mock_log.call_args.kwargs["prune_rule"], "custom_prefit_gate") + self.assertEqual( + self.mock_log.call_args.kwargs["prune_reason"], + "Configured prune metric unavailable: custom_feasibility", + ) + + def test_objective_multiobjective_rule_miss_trains_and_evaluates(self) -> None: + """Non-matching multi-objective rules should continue through validation. + + Returns + ------- + None + Asserts rule misses continue into fit plus TFLite and Keras + validation. 
+ """ + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 10.0, + "latency_budget_ms": 20.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict( + objectives=[ + Dict(metric="latency_ms", direction="minimize"), + Dict(metric="rmse_total", direction="minimize"), + ] + ), + ) + self.client.config.nas.prune = Dict( + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds deployment budget", + ) + ] + ) + self.client._hil_request = MagicMock(return_value=metrics) + + result = self.client.objective(DummyTrial()) + + self.assertEqual(result, (10.0, 0.3)) + self.client.task.build_fit_plan.assert_called_once() + self.assertEqual(self.client._evaluate_model_with_backend.call_count, 2) + self.assertFalse(self.mock_log.call_args.kwargs.get("pruned", False)) + + def test_objective_multiobjective_gated_optuna_trial_is_complete(self) -> None: + """Optuna should record gated multi-objective trials as COMPLETE. + + Returns + ------- + None + Asserts a gated multi-objective objective return is not an Optuna + PRUNED trial. 
+ """ + metrics = { + "error_code": HIL_MASTER_SUCCESS, + "ram_bytes": 512, + "flash_bytes": 512, + "arena_bytes": 1024, + "latency_ms": 25.0, + "latency_budget_ms": 20.0, + "energy_mj_per_inference": -1.0, + "avg_power_mw": -1.0, + "avg_current_ma": -1.0, + "bus_voltage_v": -1.0, + } + self.client.config.nas.score = Dict( + type="multi-objective", + metrics=Dict(), + params=Dict(objectives=[Dict(metric="latency_ms", direction="minimize")]), + ) + self.client.config.nas.prune = Dict( + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds deployment budget", + ) + ] + ) + self.client._hil_request = MagicMock(return_value=metrics) + study = optuna.create_study(directions=["minimize"]) + + study.optimize(self.client.objective, n_trials=1) + + self.assertEqual(len(study.trials), 1) + self.assertEqual(study.trials[0].state, TrialState.COMPLETE) + self.assertEqual(study.trials[0].values, [1e12]) + def test_objective_train_false_uses_generic_metric_sentinels(self) -> None: # Train-disabled trials should still emit generic metric sentinels so downstream logs keep a stable shape. 
metrics = { @@ -1263,6 +1629,32 @@ def test_objective_pure_desktop_train_false_flops_omits_quantization_search(self self.client.task.build_fit_plan.assert_not_called() self.assertNotIn("quantization_mode", trial.params) + def test_feasibility_metrics_participate_in_hil_dependency_classification(self) -> None: + """Runtime-only feasibility metrics should force runtime dependency validation.""" + self.client.config.nas.score = Dict( + type="scoring-function", + metrics=Dict(), + params=Dict(terms=[Dict(type="weighted", metric="flops", weight=-1.0)]), + ) + self.client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + + dependencies = self.client._classify_nas_metric_dependencies() + + self.assertIn("latency_ms", dependencies.metrics) + self.assertIn("latency_budget_ms", dependencies.metrics) + self.assertIn("latency_ms", dependencies.runtime_only) + class SmokeTestTests(unittest.TestCase): """Ensure the convenience smoke_test helper toggles config safely.""" @@ -1488,10 +1880,14 @@ def __init__(self, states): self.best_value = None self.enqueue_calls = [] self.metric_names_calls = [] + self.user_attrs = {} def set_metric_names(self, metric_names): self.metric_names_calls.append(list(metric_names)) + def set_user_attr(self, key, value): + self.user_attrs[key] = value + def optimize(self, func, n_trials): self.optimize_calls.append(n_trials) for _ in range(n_trials): @@ -1556,6 +1952,116 @@ def test_run_nas_sets_multiobjective_metric_names(self) -> None: self.assertEqual(dummy.metric_names_calls, [["rmse_total", "latency_ms"]]) self.assertEqual(dummy.optimize_calls, [1]) + def test_run_nas_wires_constraints_sampler_when_feasibility_enabled(self) -> None: + """Both sampler families should receive the persisted constraints hook.""" + client = 
_build_test_client() + client.config.training.nas_trials = 1 + client.config.training.max_total_trials = 1 + client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + dummy = self.DummyStudy([TrialState.COMPLETE]) + client.objective = MagicMock() + sentinel_sampler = object() + + with patch("nas_model_client.optuna.samplers.TPESampler", return_value=sentinel_sampler) as mock_tpe: + with patch("nas_model_client.optuna.create_study", return_value=dummy): + client.run_nas(study_name="demo", storage="sqlite:///dummy.db") + + constraints_func = mock_tpe.call_args.kwargs["constraints_func"] + self.assertIs(constraints_func.__self__, client) + self.assertIs(constraints_func.__func__, client._constraints_func.__func__) + self.assertEqual( + dummy.user_attrs["tinyodom_feasibility_policy_signature"]["rules"][0]["rule"], + "latency_budget", + ) + + def test_run_nas_does_not_count_not_evaluated_complete_trials_as_infeasible(self) -> None: + """Hard-pruned COMPLETE penalty trials should stay separate from infeasible trials.""" + client = _build_test_client() + client.config.training.nas_trials = 1 + client.config.training.max_total_trials = 1 + client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + trial = SimpleNamespace( + state=TrialState.COMPLETE, + user_attrs={"feasible": False, "feasibility_status": "not_evaluated"}, + ) + + class DummyStudy: + """Study double with one hard-pruned complete penalty trial.""" + + def __init__(self) -> None: + self.trials = [trial] + self.user_attrs = { + "tinyodom_feasibility_policy_signature": 
client._feasibility_policy_signature() + } + self.metric_names_calls = [] + + def set_metric_names(self, metric_names): + """Record metric names like an Optuna study.""" + self.metric_names_calls.append(list(metric_names)) + + dummy = DummyStudy() + + with patch("nas_model_client.optuna.create_study", return_value=dummy): + with patch("builtins.print") as mock_print: + client.run_nas(study_name="demo", storage="sqlite:///dummy.db") + + final_line = "\n".join(str(call.args[0]) for call in mock_print.call_args_list if call.args) + self.assertIn("0 feasible, 0 infeasible, 1 completed", final_line) + + def test_run_nas_rejects_mismatched_feasibility_signature_on_resume(self) -> None: + """Resume should fail when stored feasibility policy differs.""" + client = _build_test_client() + client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + dummy = self.DummyStudy([TrialState.COMPLETE]) + dummy.user_attrs["tinyodom_feasibility_policy_signature"] = { + "train_if_infeasible": False, + "rules": [ + { + "rule": "other", + "metric": "latency_ms", + "condition": "gt", + "reference": {"type": "metric", "metric": "latency_budget_ms"}, + } + ], + } + + with patch("nas_model_client.optuna.create_study", return_value=dummy): + with self.assertRaisesRegex(RuntimeError, "does not match"): + client.run_nas(study_name="demo", storage="sqlite:///dummy.db") + def test_run_nas_honors_max_total_trials_cap(self) -> None: """Stop retrying when the global trial-attempt cap is reached. 
@@ -1911,6 +2417,7 @@ def test_run_scoring_nas_runs_fixed_final_before_fold_rotation(self) -> None: study = SimpleNamespace( trials=[object()], best_value=0.75, + best_trial=SimpleNamespace(value=0.75, params={}), trials_dataframe=MagicMock(return_value=trials_df), ) @@ -1944,6 +2451,46 @@ def test_run_scoring_nas_runs_fixed_final_before_fold_rotation(self) -> None: "fold_rotation/fold_rotation_summary.json", ) + def test_run_scoring_nas_requires_feasible_trial_before_final_training(self) -> None: + """Single-objective closeout should not retrain infeasible penalty trials.""" + + with tempfile.TemporaryDirectory() as tmpdir: + base = Path(tmpdir) + client = _build_test_client(base_dir=base) + client.config.nas.feasibility = Dict( + train_if_infeasible=False, + rules=[ + Dict( + rule="latency_budget", + metric="latency_ms", + condition="gt", + reference=Dict(type="metric", metric="latency_budget_ms"), + reason="Latency exceeds cadence budget", + ) + ], + ) + trials_df = MagicMock() + infeasible_trial = SimpleNamespace( + state=TrialState.COMPLETE, + value=-100.0, + params={"nb_filters": 2}, + user_attrs={"feasible": False, "feasibility_status": "infeasible"}, + ) + study = SimpleNamespace( + trials=[infeasible_trial], + trials_dataframe=MagicMock(return_value=trials_df), + get_trials=MagicMock(return_value=[infeasible_trial]), + ) + + with patch.object(client, "run_nas", return_value=study), patch.object( + client, "train_best_trial" + ) as train_best: + with self.assertRaisesRegex(RuntimeError, "without any feasible completed trials"): + client.run_scoring_nas(study_name="demo") + + train_best.assert_not_called() + trials_df.to_csv.assert_called_once() + def test_run_fold_rotation_uses_per_fold_context_without_export(self) -> None: """Fold reporting should write success artifacts for requested folds.""" diff --git a/test/test_odom_tcn.py b/test/test_odom_tcn.py index 05860d5..f7b6f6d 100644 --- a/test/test_odom_tcn.py +++ b/test/test_odom_tcn.py @@ -487,6 +487,34 
@@ def test_count_flops_returns_positive_estimate(self) -> None: self.assertIsInstance(flops, int) self.assertGreater(flops, 0) + def test_estimate_static_memory_returns_positive_tcn_proxy(self) -> None: + # The OdomTCN override should count real weights and infer custom TCN internal activation traffic. + hparams = { + "nb_filters": 8, + "kernel_size": 5, + "dropout_rate": 0.1, + "use_skip_connections": True, + "norm_flag": False, + "dilations": [1, 2, 4], + } + model = self.family.build_model(hparams, self.ctx, {}) + + estimate = self.family.estimate_static_memory( + model, + self.ctx, + {}, + quantization_mode="int8_ptq", + ) + + self.assertEqual(estimate.dtype_bytes, 1) + self.assertGreater(estimate.weight_bytes, 0) + self.assertGreater(estimate.activation_bytes, 0) + self.assertEqual( + estimate.memory_traffic_bytes, + estimate.weight_bytes + estimate.activation_bytes, + ) + self.assertGreater(estimate.warning_count, 0) + def test_validate_hparams_rejects_missing_required_keys(self) -> None: # Hyperparameter validation should reject missing required keys before model construction starts. hparams = { diff --git a/test/test_stedgeai_phase0_probe.py b/test/test_stedgeai_phase0_probe.py index 2b28981..851d064 100644 --- a/test/test_stedgeai_phase0_probe.py +++ b/test/test_stedgeai_phase0_probe.py @@ -3,6 +3,7 @@ from __future__ import annotations import importlib.util +import os import subprocess import sys import tempfile @@ -10,6 +11,11 @@ from pathlib import Path from unittest.mock import patch +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + def _load_module(module_name: str, relative_path: str): """Load an analysis script by repository-relative path for wrapper tests. 
diff --git a/test/test_stm32_backend.py b/test/test_stm32_backend.py index 9f667ab..2c99966 100644 --- a/test/test_stm32_backend.py +++ b/test/test_stm32_backend.py @@ -723,6 +723,7 @@ def test_evaluate_cadenced_runtime_mode_merges_second_pass_metrics(self) -> None self.assertEqual(phase_mock.call_args_list[1].kwargs["phase"], "cadenced") self.assertEqual(metrics.power_metrics["runtime_mode"], "cadenced") self.assertEqual(metrics.power_metrics["cadenced_error_code"], HIL_ERROR_OK) + self.assertAlmostEqual(metrics.latency_s, 0.080) self.assertAlmostEqual(metrics.power_metrics["cadenced_active_inference_latency_ms"], 80.0) self.assertAlmostEqual(metrics.power_metrics["cadenced_window_latency_ms"], 20000.0) self.assertAlmostEqual(metrics.power_metrics["cadenced_energy_mj_per_window"], 1.25) @@ -1468,7 +1469,7 @@ def test_evaluate_combined_external_flash_and_harness_uses_canonical_order(self) }, )() telemetry = stm32_runtime.STM32RuntimeTelemetry( - latency_s=0.003, + latency_s=0.001, serial_log=["STM32_AI_INIT=OK", "DUT READY", "STM32_AI_RUN=OK"], power_metrics={ "clock_hz": 600000000.0, @@ -1649,6 +1650,7 @@ def _wait_done(**kwargs): 5, ) self.assertEqual(metrics.error_code, HIL_ERROR_OK) + self.assertEqual(metrics.latency_s, 0.003) self.assertEqual(metrics.external_flash_bytes, 4096) self.assertEqual(metrics.power_metrics["weight_storage_mode"], "external_flash") self.assertEqual(metrics.power_metrics["runs"], 5) @@ -3282,6 +3284,12 @@ def test_real_lrun_template_parsers_match_checked_in_files(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: staged_root = Path(tmpdir) / canonical_root.name shutil.copytree(canonical_root, staged_root) + _write_text( + staged_root / "FSBL" / "Inc" / "stm32_extmem_conf.h", + "#define EXTMEM_LRUN_SOURCE_SIZE 0x00020000\n", + ) + _write_text(staged_root / "Appli" / "Inc" / "main.h", "#pragma once\n") + _write_text(staged_root / "Appli" / "Src" / "system_stm32n6xx_s.c", "void SystemInit(void) {}\n") _write_text( staged_root / 
"Appli" / "Inc" / "network_data_params.h", "#define AI_NETWORK_DATA_ACTIVATIONS_SIZE (47688)\n", diff --git a/test/test_stm32_build_wrapper.py b/test/test_stm32_build_wrapper.py index c03109f..070549a 100644 --- a/test/test_stm32_build_wrapper.py +++ b/test/test_stm32_build_wrapper.py @@ -3,12 +3,18 @@ from __future__ import annotations import importlib.util +import os import subprocess import sys import unittest from pathlib import Path from unittest.mock import patch +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + def _load_module(module_name: str, relative_path: str): """Load an analysis script by repository-relative path for wrapper tests. diff --git a/test/test_stm32_project_portability.py b/test/test_stm32_project_portability.py index 67072a8..eb327ea 100644 --- a/test/test_stm32_project_portability.py +++ b/test/test_stm32_project_portability.py @@ -2,9 +2,15 @@ from __future__ import annotations +import os import unittest from pathlib import Path +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + REPO_ROOT = Path(__file__).resolve().parents[1] STM32_ROOT = REPO_ROOT / "analysis_scripts" / "stm32_example_project" diff --git a/test/test_stm32_runner_wrappers.py b/test/test_stm32_runner_wrappers.py index d0a72d8..0e1f5cb 100644 --- a/test/test_stm32_runner_wrappers.py +++ b/test/test_stm32_runner_wrappers.py @@ -4,6 +4,7 @@ import importlib.util import json +import os import subprocess import sys import tempfile @@ -12,6 +13,11 @@ from pathlib import Path from unittest.mock import patch +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + def _load_module(module_name: str, relative_path: str): """Load an analysis script by repository-relative path for wrapper tests. 
diff --git a/test/test_stm32_template_ownership.py b/test/test_stm32_template_ownership.py index 353adac..ebfe624 100644 --- a/test/test_stm32_template_ownership.py +++ b/test/test_stm32_template_ownership.py @@ -1,10 +1,16 @@ """Tests for the checked-in STM32 LRUN template ownership manifest.""" +import os import subprocess import sys import unittest from pathlib import Path +import pytest + +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) + ROOT_DIR = Path(__file__).resolve().parents[1] SRC_DIR = ROOT_DIR / "src" diff --git a/test/test_urbansound8k_input_profile.py b/test/test_urbansound8k_input_profile.py index d8a6ca1..a74006d 100644 --- a/test/test_urbansound8k_input_profile.py +++ b/test/test_urbansound8k_input_profile.py @@ -3,13 +3,17 @@ from __future__ import annotations import importlib.util +import os import sys import tempfile import unittest from pathlib import Path import numpy as np +import pytest +if os.environ.get("RUN_ANALYSIS_SCRIPT_TESTS") != "1": + pytest.skip("analysis-script tests are opt-in", allow_module_level=True) ROOT_DIR = Path(__file__).resolve().parents[1] SCRIPT_PATH = ROOT_DIR / "analysis_scripts" / "hil_noise_analysis" / "urbansound8k_input_profile.py"