From fad52d85413774ef502db5944affcf6ef81fcdb5 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 9 Apr 2026 13:35:17 -0500 Subject: [PATCH 01/24] Consolidate TelemetryCacheHandler to single lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace separate _lock and _callback_condition with a single _condition (threading.Condition) that protects all shared state: _shutdown, _is_flushing, _callbacks_item_count, _events_logged. The two-lock design had a lock order inversion: on_payload_transmitted acquired _lock then _callback_condition, while wait_for_callbacks acquired _callback_condition then _lock (via is_flushing). This could deadlock under concurrent flush + callback scenarios. Using one lock eliminates the ordering issue entirely and simplifies the code — the on_payload_transmitted callback no longer needs nested lock acquisition. Files changed: - olive/telemetry/telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 42 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 0ddb690e2..acd6989aa 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -103,9 +103,10 @@ def __init__(self, telemetry: "Telemetry") -> None: # Single shared cache file for all processes self._cache_file_name = CACHE_FILE_NAME self._shutdown = False - # Protects all shared state to prevent race conditions - self._lock = threading.Lock() - self._callback_condition = threading.Condition() + # Single condition protects all shared state: _shutdown, _is_flushing, + # _callbacks_item_count, _events_logged. Using one lock eliminates + # lock ordering issues that arise with separate locks. 
+ self._condition = threading.Condition() self._callbacks_item_count = 0 self._events_logged = 0 # Prevents concurrent flush operations @@ -118,7 +119,7 @@ def shutdown(self) -> None: offline resilience. If network is working, success callbacks already flushed. If network is down, flushing would fail anyway. """ - with self._lock: + with self._condition: self._shutdown = True def __del__(self): @@ -150,7 +151,7 @@ def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None payload = None should_flush = False - with self._lock: + with self._condition: if self._shutdown: return @@ -159,9 +160,8 @@ def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None # so it's unlikely that an event would suddenly fail and need to be cached # and we don't need to flush again. if self._is_flushing: - with self._callback_condition: - self._callbacks_item_count += args.item_count - self._callback_condition.notify_all() + self._callbacks_item_count += args.item_count + self._condition.notify_all() return if args.succeeded: @@ -182,26 +182,24 @@ def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None # Fail silently - telemetry should never crash the application pass finally: - with self._callback_condition: + with self._condition: self._callbacks_item_count += args.item_count - self._callback_condition.notify_all() + self._condition.notify_all() def wait_for_callbacks(self, timeout_sec: float, during_flush: bool = False) -> bool: deadline = time.time() + timeout_sec while True: - with self._callback_condition: - callbacks_item_count = self._callbacks_item_count - expected_items = self._events_logged - if (during_flush or not self.is_flushing) and callbacks_item_count >= expected_items: + with self._condition: + if (during_flush or not self._is_flushing) and self._callbacks_item_count >= self._events_logged: return True remaining = deadline - time.time() if remaining <= 0: return False - with 
self._callback_condition: - self._callback_condition.wait(timeout=remaining) + with self._condition: + self._condition.wait(timeout=remaining) def record_event_logged(self, count: int = 1) -> None: - with self._callback_condition: + with self._condition: self._events_logged += count def _schedule_flush(self) -> None: @@ -218,7 +216,7 @@ def _schedule_flush(self) -> None: - Daemon thread is acceptable (flush is best-effort) """ # Check before spawning thread to avoid unnecessary thread creation - with self._lock: + with self._condition: if self._shutdown or self._is_flushing: return self._is_flushing = True @@ -231,7 +229,7 @@ def flush_task(): pass finally: # Always clear flag, even on exception - with self._lock: + with self._condition: self._is_flushing = False thread = threading.Thread(target=flush_task, daemon=True) @@ -350,7 +348,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: flush_path = None try: # Check shutdown before starting (under lock to prevent race) - with self._lock: + with self._condition: if self._shutdown: return @@ -395,7 +393,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: continue # Check if shutdown happened during flush - with self._lock: + with self._condition: if self._shutdown: # Restore cache to avoid data loss during shutdown if flush_path and flush_path.exists(): @@ -430,7 +428,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: @property def is_flushing(self) -> bool: - with self._lock: + with self._condition: return self._is_flushing From fe680ac556f8aec432a4fc4d378a9dd5bf27fe41 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 9 Apr 2026 15:41:43 -0500 Subject: [PATCH 02/24] Fix concurrency issues in telemetry wait_for_callbacks: Hold _condition lock continuously from condition check through wait() to prevent missing notifications. Previously the lock was released between checking and waiting, allowing notify_all() to fire in the gap. 
TelemetryLogger: Add RLock to __new__ and get_default_logger to prevent race conditions when multiple threads create the singleton simultaneously. Uses RLock because get_default_logger calls __new__ which both need the same lock. Files changed: - olive/telemetry/telemetry.py - olive/telemetry/library/telemetry_logger.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/library/telemetry_logger.py | 20 ++++++++++++-------- olive/telemetry/telemetry.py | 11 +++++------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 7eb236e75..c47c9eb0e 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -6,6 +6,7 @@ """High-level telemetry logger facade for easy usage.""" import logging +import threading import uuid from typing import Any, Callable, Optional @@ -28,6 +29,7 @@ class TelemetryLogger: _instance: Optional["TelemetryLogger"] = None _default_logger: Optional["TelemetryLogger"] = None + _singleton_lock = threading.RLock() _logger: Optional[logging.Logger] = None _logger_exporter: Optional[OneCollectorLogExporter] = None _logger_provider: Optional[LoggerProvider] = None @@ -39,9 +41,10 @@ def __new__(cls, options: Optional[OneCollectorExporterOptions] = None): options: Exporter options (only used on first instantiation) """ - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance._initialize(options) + with cls._singleton_lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialize(options) return cls._instance @@ -151,11 +154,12 @@ def get_default_logger(cls, connection_string: Optional[str] = None) -> "Telemet TelemetryLogger instance """ - if cls._default_logger is None: - options = None - if connection_string: - options = OneCollectorExporterOptions(connection_string=connection_string) - 
cls._default_logger = cls(options=options) + with cls._singleton_lock: + if cls._default_logger is None: + options = None + if connection_string: + options = OneCollectorExporterOptions(connection_string=connection_string) + cls._default_logger = cls(options=options) return cls._default_logger diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index acd6989aa..a6e3b3b2e 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -188,14 +188,13 @@ def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None def wait_for_callbacks(self, timeout_sec: float, during_flush: bool = False) -> bool: deadline = time.time() + timeout_sec - while True: - with self._condition: + with self._condition: + while True: if (during_flush or not self._is_flushing) and self._callbacks_item_count >= self._events_logged: return True - remaining = deadline - time.time() - if remaining <= 0: - return False - with self._condition: + remaining = deadline - time.time() + if remaining <= 0: + return False self._condition.wait(timeout=remaining) def record_event_logged(self, count: int = 1) -> None: From 8aa3d3d4ca7fcc1b5e4719764bc8bd67152606fa Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 9 Apr 2026 17:20:42 -0500 Subject: [PATCH 03/24] Simplify telemetry cache and singleton patterns 1. Remove base64 encoding from cache: write raw JSON lines instead. The base64 layer added 33% size overhead and prevented human inspection during debugging with no security benefit (cache dir is user-owned). Cache file is now directly readable. 2. Simplify cache flush: remove the .flush file rename dance (atomic rename, restore on failure, stale file cleanup). For ~5 events/session, simple lock-read-send-delete is sufficient. On failure the cache file persists for next retry. 3. Simplify Telemetry singleton: remove double-checked locking in __new__. 
The lock is cheap and called once at startup; the outer check saved a lock acquisition but added complexity. Net: -83 lines. Files changed: - olive/telemetry/telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 125 ++++++++--------------------------- 1 file changed, 27 insertions(+), 98 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index a6e3b3b2e..2d243eb20 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -19,8 +19,6 @@ from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger, get_telemetry_logger from olive.telemetry.utils import ( - _decode_cache_line, - _encode_cache_line, _exclusive_file_lock, get_telemetry_base_dir, ) @@ -292,12 +290,11 @@ def _write_payload_to_cache(self, payload: bytes) -> None: if not entries: return - # Append base64-encoded newline-delimited entries + # Append newline-delimited JSON entries # Use exclusive file lock for multi-process safety with _exclusive_file_lock(cache_path, mode="a") as cache_file: for entry in entries: - plain = json.dumps(entry, ensure_ascii=False, separators=(",", ":")) - cache_file.write(_encode_cache_line(plain) + "\n") + cache_file.write(json.dumps(entry, ensure_ascii=False, separators=(",", ":")) + "\n") return except OSError as exc: # Retry only on transient access errors (file locked by another process) @@ -322,59 +319,28 @@ def _flush_cache(self) -> None: def _flush_cache_file(self, cache_path: Path) -> None: """Flush cached events back to telemetry service. - Approach: - 1. Atomically rename cache → .flush (claims ownership, prevents concurrent flushes) - 2. Read all events from .flush file - 3. Queue all events for sending via telemetry logger - 4. Force flush with 2-second timeout - 5. On success: delete .flush file - 6. 
On failure: restore .flush → cache for retry - - Multi-process coordination: - - `replace()` is atomic; only one process can successfully rename the cache file - - If another process already renamed it, we get FileNotFoundError and abort - - Stale .flush files from crashes are overwritten by the atomic rename - - Shutdown handling: - - If shutdown flag set during flush, restore cache before returning - - This preserves events even if callbacks don't fire during shutdown - - Callback behavior: - - Queued events trigger callbacks with success/failure - - Failed events are automatically re-cached via callbacks (unless shutting down) - - The _is_flushing flag prevents re-caching of replayed events during flush + Uses atomic rename to claim the cache file, preventing duplicate + sends when multiple processes flush concurrently. """ flush_path = None try: - # Check shutdown before starting (under lock to prevent race) with self._condition: if self._shutdown: return - if not cache_path.exists(): - return - - # Atomically rename to .flush file to claim ownership - # Overwrite any stale .flush file from crashed process (C# pattern) - flush_path = cache_path.with_name(f"{cache_path.name}.flush") + # Atomically rename to claim ownership — only one process can succeed + flush_path = cache_path.with_suffix(".flush") try: - # On Windows/POSIX, replace() overwrites existing files atomically cache_path.replace(flush_path) except FileNotFoundError: - # Cache already claimed by another flush or doesn't exist return - # Read all cached entries (base64-decoded) entries = _read_cache_entries(flush_path) - if not entries: - # Empty cache, just delete the flush file flush_path.unlink(missing_ok=True) return - # Replay all events through telemetry logger - # Note: _is_flushing flag (set by caller) prevents these callbacks from re-caching or triggering nested flushes - # (unlikely since we just successfully sent an event, indicating network is available) + # Replay cached events — 
_is_flushing flag prevents re-caching for entry in entries: try: event_name = entry["event_name"] @@ -384,46 +350,24 @@ def _flush_cache_file(self, cache_path: Path) -> None: attributes = json.loads(event_data) if not isinstance(attributes, dict): continue - # Preserve original timestamp attributes["initTs"] = entry.get("initTs", entry["ts"]) self._telemetry.log(event_name, attributes, None) except Exception: - # Skip malformed entries continue - # Check if shutdown happened during flush - with self._condition: - if self._shutdown: - # Restore cache to avoid data loss during shutdown - if flush_path and flush_path.exists(): - try: - cache_path.parent.mkdir(parents=True, exist_ok=True) - flush_path.replace(cache_path) - except Exception: - # Silently ignore errors during cleanup - pass - return - - # Wait for in-flight callbacks to complete before deciding success/failure flush_success = self.wait_for_callbacks(timeout_sec=5.0, during_flush=True) if flush_success: - # Success: delete the flush file (events were sent) - if flush_path: - flush_path.unlink(missing_ok=True) - elif flush_path and flush_path.exists(): - # Failure: restore cache for retry later - cache_path.parent.mkdir(parents=True, exist_ok=True) + flush_path.unlink(missing_ok=True) + else: + # Restore cache for next retry flush_path.replace(cache_path) except Exception: - # Best-effort restore on any exception to prevent data loss - if flush_path and flush_path.exists(): - try: - cache_path.parent.mkdir(parents=True, exist_ok=True) + # Best-effort restore on failure + try: + if flush_path and flush_path.exists(): flush_path.replace(cache_path) - except Exception: - # If restore fails, we lose the data (acceptable for telemetry) - pass - return + except Exception: + pass @property def is_flushing(self) -> bool: @@ -442,17 +386,12 @@ class Telemetry: _lock = threading.Lock() def __new__(cls): - """Create or return the singleton instance. 
- - Thread-safe singleton implementation using double-checked locking. - """ - if cls._instance is None: - with cls._lock: - # Double-check pattern to prevent race conditions - if cls._instance is None: - instance = super().__new__(cls) - instance._initialized = False - cls._instance = instance + """Create or return the singleton instance.""" + with cls._lock: + if cls._instance is None: + instance = super().__new__(cls) + instance._initialized = False + cls._instance = instance return cls._instance def __init__(self): @@ -740,18 +679,10 @@ def _set_nested_value(data: dict[str, Any], key: str, value: Any) -> None: def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: - """Read all entries from a cache file, decoding each line. - - Design decisions: - - Use file locking for multi-process safety - - Continue reading past malformed entries (partial data recovery) - - Return empty list on complete read failure (fail gracefully) - - Each line is base64-decoded before JSON parsing. + """Read all JSON-line entries from a cache file. - Assumptions: - - Cache file contains newline-delimited base64-encoded entries (one per line) - - Each line is independent (one malformed line doesn't affect others) - - Empty or whitespace-only lines are skipped + Each line is independent — malformed lines are skipped without + affecting other entries. Returns empty list on read failure. 
""" entries = [] try: @@ -761,13 +692,11 @@ def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: if not line: continue try: - line = json.loads(_decode_cache_line(line)) - if isinstance(line, dict): - entries.append(line) + parsed = json.loads(line) + if isinstance(parsed, dict): + entries.append(parsed) except Exception: - # Malformed line, skip and continue continue except Exception: - # If file cannot be opened or read, return empty list return [] return entries From 166487c1bfd53e0efa668f75f97c65efa38072ea Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 9 Apr 2026 21:39:20 -0500 Subject: [PATCH 04/24] Fix callback double-counting and multi-process cache overwrite Fix 1: Remove duplicate callback count increment in on_payload_transmitted when _is_flushing is true. The count was incremented both in the early-return path and in the finally block, causing wait_for_callbacks to think events completed before they actually did. Now only the finally block increments (which always runs, even on return). Fix 2: Replace flush_path.replace(cache_path) with new _restore_flush_file() method that appends flush entries into the cache file instead of overwriting it. This prevents losing events written by another process while a flush was in progress. Files changed: - olive/telemetry/telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 2d243eb20..0cc8b5c73 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -158,8 +158,6 @@ def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None # so it's unlikely that an event would suddenly fail and need to be cached # and we don't need to flush again. 
if self._is_flushing: - self._callbacks_item_count += args.item_count - self._condition.notify_all() return if args.succeeded: @@ -316,6 +314,29 @@ def _flush_cache(self) -> None: self._flush_cache_file(cache_path) + def _restore_flush_file(self, flush_path: Optional[Path], cache_path: Path) -> None: + """Restore a claimed flush file back into the cache without overwriting new entries. + + Another process may create a fresh cache file while this process is flushing. + Appending the old flush contents preserves both sets of entries. + """ + if not flush_path or not flush_path.exists(): + return + + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with ( + _exclusive_file_lock(cache_path, mode="a") as cache_file, + _exclusive_file_lock(flush_path, mode="r") as flush_file, + ): + for raw_line in flush_file: + line = raw_line.rstrip("\n") + if line: + cache_file.write(line + "\n") + flush_path.unlink(missing_ok=True) + except Exception: + pass + def _flush_cache_file(self, cache_path: Path) -> None: """Flush cached events back to telemetry service. @@ -360,14 +381,10 @@ def _flush_cache_file(self, cache_path: Path) -> None: flush_path.unlink(missing_ok=True) else: # Restore cache for next retry - flush_path.replace(cache_path) + self._restore_flush_file(flush_path, cache_path) except Exception: # Best-effort restore on failure - try: - if flush_path and flush_path.exists(): - flush_path.replace(cache_path) - except Exception: - pass + self._restore_flush_file(flush_path, cache_path) @property def is_flushing(self) -> bool: From a6ff131e7b8f57c597eee73a66152c997aa05d81 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 13:39:30 -0500 Subject: [PATCH 05/24] Track recipe telemetry in CI Emit a single OliveRecipe event per workflow so recipe usage and failures can be measured without double-counting nested action failures. 
Keep CI in recipe-only mode, make Olive app naming explicit, update the telemetry ingestion key, and harden the cache/locking path that the new telemetry depends on. Files changed: - docs/Privacy.md - olive/cli/base.py - olive/cli/run.py - olive/telemetry/constants.py - olive/telemetry/library/options.py - olive/telemetry/library/telemetry_logger.py - olive/telemetry/telemetry.py - olive/telemetry/telemetry_extensions.py - olive/telemetry/utils.py - olive/workflows/run/run.py - test/test_telemetry.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/Privacy.md | 2 +- olive/cli/base.py | 25 ++- olive/cli/run.py | 18 +- olive/telemetry/constants.py | 2 +- olive/telemetry/library/options.py | 1 + olive/telemetry/library/telemetry_logger.py | 21 +- olive/telemetry/telemetry.py | 52 ++++- olive/telemetry/telemetry_extensions.py | 18 +- olive/telemetry/utils.py | 95 ++++++++- olive/workflows/run/run.py | 225 ++++++++++++++++++-- test/test_telemetry.py | 108 ++++++++++ test/workflows/test_workflow_run.py | 90 ++++++++ 12 files changed, 599 insertions(+), 58 deletions(-) create mode 100644 test/test_telemetry.py diff --git a/docs/Privacy.md b/docs/Privacy.md index 95aee00b0..9e1001e72 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -13,4 +13,4 @@ In addition, Olive may collect additional telemetry data such as: - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. Telemetry is also automatically disabled when a CI/CD environment is detected (e.g., GitHub Actions, Azure Pipelines, Jenkins). If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. 
You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. diff --git a/olive/cli/base.py b/olive/cli/base.py index 75fd2816c..90ba0dcc3 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------- import json import re +import tempfile from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace from pathlib import Path @@ -16,6 +17,7 @@ from olive.hardware.accelerator import AcceleratorSpec from olive.hardware.constants import DEVICE_TO_EXECUTION_PROVIDERS from olive.resource_path import OLIVE_RESOURCE_ANNOTATIONS +from olive.workflows import run as olive_run class BaseOliveCLICommand(ABC): @@ -29,10 +31,6 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option parser.error(f"Unknown arguments: {unknown_args}") def _run_workflow(self): - import tempfile - - from olive.workflows import run as olive_run - Path(self.args.output_path).mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir: @@ -42,17 +40,28 @@ def _run_workflow(self): if self.args.dry_run: print("Dry run mode enabled. 
Configuration file is generated but no optimization is performed.") return None - workflow_output = olive_run(run_config) + workflow_output = olive_run(run_config, recipe_telemetry_metadata=self._get_recipe_telemetry_metadata()) if not workflow_output.has_output_model(): print("No output model produced. Please check the log for details.") else: print(f"Model is saved at {self.args.output_path}") return workflow_output + def _get_recipe_telemetry_metadata(self) -> dict[str, str]: + recipe_name = self.__class__.__name__ + if recipe_name.endswith("Command"): + recipe_name = recipe_name[: -len("Command")] + return { + "recipe_name": recipe_name, + "recipe_command": recipe_name, + "recipe_source": "generated_cli", + "recipe_format": "generated", + } + @staticmethod def _parse_extra_options(kv_items): - from onnxruntime_genai import __version__ as OrtGenaiVersion - from packaging import version + from onnxruntime_genai import __version__ as OrtGenaiVersion # noqa: PLC0415 + from packaging import version # noqa: PLC0415 if version.parse(OrtGenaiVersion) <= version.parse("0.9.0"): raise ValueError( @@ -60,7 +69,7 @@ def _parse_extra_options(kv_items): "Please either upgrade to onnxruntime-genai version > 0.9.0 or use the model builder pass directly in the config file." ) - from onnxruntime_genai.models.builder import parse_extra_options + from onnxruntime_genai.models.builder import parse_extra_options # noqa: PLC0415 return parse_extra_options(kv_items) diff --git a/olive/cli/run.py b/olive/cli/run.py index 6d2a831ae..1d8080581 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from argparse import ArgumentParser +from pathlib import Path from olive.cli.base import ( BaseOliveCLICommand, @@ -11,7 +12,9 @@ add_telemetry_options, get_input_model_config, ) +from olive.common.config_utils import load_config_file from olive.telemetry import action +from olive.workflows import run as olive_run class WorkflowRunCommand(BaseOliveCLICommand): @@ -49,11 +52,9 @@ def register_subcommand(parser: ArgumentParser): @action def run(self): - from olive.common.config_utils import load_config_file - from olive.workflows import run as olive_run - # allow the run_config to be a dict already (for api use) - run_config = self.args.run_config + run_config_input = self.args.run_config + run_config = run_config_input if not isinstance(run_config, dict): run_config = load_config_file(run_config) if input_model_config := get_input_model_config(self.args, required=False): @@ -73,6 +74,15 @@ def run(self): list_required_packages=self.args.list_required_packages, tempdir=self.args.tempdir, package_config=self.args.package_config, + recipe_telemetry_metadata={ + "recipe_command": "WorkflowRun", + "recipe_source": "config_dict" if isinstance(run_config_input, dict) else "config_file", + "recipe_format": "dict" + if isinstance(run_config_input, dict) + else Path(run_config_input).suffix.lstrip(".").lower() or "unknown", + "execution_mode": "list_required_packages" if self.args.list_required_packages else "run", + "package_config_provided": bool(self.args.package_config), + }, ) if self.args.list_required_packages is True: diff --git a/olive/telemetry/constants.py b/olive/telemetry/constants.py index ca9e150b1..535929866 100644 --- a/olive/telemetry/constants.py +++ b/olive/telemetry/constants.py @@ -5,4 +5,4 @@ """OneCollector connection string.""" -CONNECTION_STRING = "SW5zdHJ1bWVudGF0aW9uS2V5PTlkNWRkYWVjNjFlMjQ1NjdiNzg4YTIwYWVhMzI0NjMxLTcyMzdkN2M2LWVlNjEtNGNmZC1iYjdiLTU5MDNhOTcyYzJlNC03MDQ3" 
+CONNECTION_STRING = "SW5zdHJ1bWVudGF0aW9uS2V5PTYyMTUwOTExZGMwMDRmYzliYjY3YmE5NjA2NDI3ZTU2LWVjNjFmOWFmLTVkN2EtNGQxOS1hZjMxLWI5Y2Q2OWU5ODdmMS02OTE1" diff --git a/olive/telemetry/library/options.py b/olive/telemetry/library/options.py index dd934cad2..31fd1ba19 100644 --- a/olive/telemetry/library/options.py +++ b/olive/telemetry/library/options.py @@ -62,6 +62,7 @@ class OneCollectorExporterOptions: """Configuration options for OneCollector exporter.""" connection_string: Optional[str] = None + service_name: Optional[str] = None transport_options: OneCollectorTransportOptions = field(default_factory=OneCollectorTransportOptions) # Internal fields populated during validation diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index c47c9eb0e..928031e15 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -19,6 +19,8 @@ from olive.telemetry.library.options import OneCollectorExporterOptions from olive.version import __version__ as VERSION +DEFAULT_SERVICE_NAME = __name__.split(".", maxsplit=1)[0] + class TelemetryLogger: """Singleton telemetry logger for simplified OneCollector integration. 
@@ -60,10 +62,11 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: self._logger_exporter = OneCollectorLogExporter(options=options) # Create logger provider + service_name = options.service_name or DEFAULT_SERVICE_NAME self._logger_provider = LoggerProvider( resource=Resource.create( { - "service.name": __name__.split(".", maxsplit=1)[0], + "service.name": service_name, "service.version": VERSION, "service.instance.id": str(uuid.uuid4()), # Unique instance ID; can double as session ID } @@ -144,11 +147,14 @@ def shutdown(self) -> None: self._logger_provider.shutdown() @classmethod - def get_default_logger(cls, connection_string: Optional[str] = None) -> "TelemetryLogger": + def get_default_logger( + cls, connection_string: Optional[str] = None, service_name: Optional[str] = None + ) -> "TelemetryLogger": """Get or create the default telemetry logger. Args: connection_string: OneCollector connection string (only used on first call) + service_name: Logical application/service name for emitted telemetry (only used on first call) Returns: TelemetryLogger instance @@ -158,7 +164,9 @@ def get_default_logger(cls, connection_string: Optional[str] = None) -> "Telemet if cls._default_logger is None: options = None if connection_string: - options = OneCollectorExporterOptions(connection_string=connection_string) + options = OneCollectorExporterOptions( + connection_string=connection_string, service_name=service_name + ) cls._default_logger = cls(options=options) return cls._default_logger @@ -171,17 +179,20 @@ def shutdown_default_logger(cls) -> None: cls._default_logger = None -def get_telemetry_logger(connection_string: Optional[str] = None) -> TelemetryLogger: +def get_telemetry_logger( + connection_string: Optional[str] = None, service_name: Optional[str] = None +) -> TelemetryLogger: """Get or create the default telemetry logger. 
Args: connection_string: OneCollector connection string (only used on first call) + service_name: Logical application/service name for emitted telemetry (only used on first call) Returns: TelemetryLogger instance """ - return TelemetryLogger.get_default_logger(connection_string=connection_string) + return TelemetryLogger.get_default_logger(connection_string=connection_string, service_name=service_name) def log_event(event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 0cc8b5c73..6a9bebe17 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -28,6 +28,7 @@ # Default event names used by the high-level telemetry helpers. HEARTBEAT_EVENT_NAME = "OliveHeartbeat" +RECIPE_EVENT_NAME = "OliveRecipe" # CI/CD environment variables whose presence indicates an automated pipeline. _CI_ENV_VARS = ( @@ -41,6 +42,7 @@ ) ACTION_EVENT_NAME = "OliveAction" ERROR_EVENT_NAME = "OliveError" +APP_NAME = "Olive" ALLOWED_KEYS = { HEARTBEAT_EVENT_NAME: { @@ -70,6 +72,34 @@ "app_instance_id", "initTs", }, + RECIPE_EVENT_NAME: { + "recipe_name", + "recipe_hash", + "recipe_source", + "recipe_format", + "recipe_command", + "execution_mode", + "workflow_id", + "success", + "exception_type", + "input_model_type", + "input_model_source", + "input_model_name_hash", + "model_task", + "target_system_type", + "target_device", + "execution_provider", + "execution_providers", + "pass_types", + "pass_count", + "data_config_count", + "search_enabled", + "package_config_provided", + "is_ci", + "app_version", + "app_instance_id", + "initTs", + }, } CRITICAL_EVENTS = {HEARTBEAT_EVENT_NAME} @@ -78,6 +108,11 @@ CACHE_FILE_NAME = "olive.json" +def is_ci_environment() -> bool: + """Detect CI/CD environments by checking well-known environment variables.""" + return any(os.environ.get(var) for var in _CI_ENV_VARS) + + class TelemetryCacheHandler: """Handles caching of failed telemetry events 
for offline resilience. @@ -240,7 +275,7 @@ def cache_path(self) -> Optional[Path]: """ telemetry_cache_dir = None if "OLIVE_TELEMETRY_CACHE_DIR" in os.environ: - telemetry_cache_dir = os.environ["OLIVE_TELEMETRY_CACHE_DIR"] + telemetry_cache_dir = Path(os.environ["OLIVE_TELEMETRY_CACHE_DIR"]).expanduser() if not telemetry_cache_dir: telemetry_cache_dir = get_telemetry_base_dir() / "cache" return telemetry_cache_dir / self._cache_file_name @@ -419,6 +454,7 @@ def __init__(self): self._logger = None self._cache_handler = None + self._recipe_only_ci_telemetry = False try: self._logger = self._create_logger() @@ -426,11 +462,9 @@ def __init__(self): self._cache_handler = TelemetryCacheHandler(self) self._setup_payload_callbacks() - if self._is_ci_environment(): - self.disable_telemetry() - self._initialized = True - return - self._log_heartbeat() + self._recipe_only_ci_telemetry = self._is_ci_environment() + if not self._is_ci_environment(): + self._log_heartbeat() if os.environ.get("OLIVE_DISABLE_TELEMETRY") == "1": self.disable_telemetry() self._initialized = True @@ -441,11 +475,11 @@ def __init__(self): @staticmethod def _is_ci_environment() -> bool: """Detect CI/CD environments by checking well-known environment variables.""" - return any(os.environ.get(var) for var in _CI_ENV_VARS) + return is_ci_environment() def _create_logger(self) -> Optional[TelemetryLogger]: try: - return get_telemetry_logger(base64.b64decode(CONNECTION_STRING).decode()) + return get_telemetry_logger(base64.b64decode(CONNECTION_STRING).decode(), service_name=APP_NAME) except Exception: return None @@ -498,6 +532,8 @@ def log( """ try: + if self._recipe_only_ci_telemetry and event_name != RECIPE_EVENT_NAME: + return attrs = _merge_metadata(attributes, metadata) if self._logger is None: return diff --git a/olive/telemetry/telemetry_extensions.py b/olive/telemetry/telemetry_extensions.py index e5b13395d..ff5a2c703 100644 --- a/olive/telemetry/telemetry_extensions.py +++ 
b/olive/telemetry/telemetry_extensions.py @@ -9,7 +9,7 @@ from types import TracebackType from typing import Any, Callable, Optional, TypeVar -from olive.telemetry.telemetry import ACTION_EVENT_NAME, ERROR_EVENT_NAME, _get_logger +from olive.telemetry.telemetry import ACTION_EVENT_NAME, ERROR_EVENT_NAME, RECIPE_EVENT_NAME, _get_logger from olive.telemetry.utils import _format_exception_message _TFunc = TypeVar("_TFunc", bound=Callable[..., Any]) @@ -45,6 +45,22 @@ def log_error( telemetry.log(ERROR_EVENT_NAME, attributes, metadata) +def log_recipe_result( + recipe_name: str, + success: bool, + metadata: Optional[dict[str, Any]] = None, + exception_type: Optional[str] = None, +) -> None: + telemetry = _get_logger() + attributes = { + "recipe_name": recipe_name, + "success": success, + } + if exception_type: + attributes["exception_type"] = exception_type + telemetry.log(RECIPE_EVENT_NAME, attributes, metadata) + + def _resolve_invoked_from(skip_frames: int = 0) -> str: """Resolve how Olive was invoked by examining the call stack. 
diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 52a39acde..830ca055d 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -10,9 +10,64 @@ import traceback from pathlib import Path from types import TracebackType -from typing import Optional +from typing import ClassVar, Optional + +if os.name == "posix": + import fcntl +else: + fcntl = None + +if os.name == "nt": + import ctypes + import msvcrt + from ctypes import wintypes + + _LOCKFILE_EXCLUSIVE_LOCK = 0x00000002 + + class _Overlapped(ctypes.Structure): + _fields_: ClassVar[list[tuple[str, object]]] = [ + ("Internal", ctypes.c_void_p), + ("InternalHigh", ctypes.c_void_p), + ("Offset", wintypes.DWORD), + ("OffsetHigh", wintypes.DWORD), + ("hEvent", wintypes.HANDLE), + ] + + _kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) + _lock_file_ex = _kernel32.LockFileEx + _lock_file_ex.argtypes = [ + wintypes.HANDLE, + wintypes.DWORD, + wintypes.DWORD, + wintypes.DWORD, + wintypes.DWORD, + ctypes.POINTER(_Overlapped), + ] + _lock_file_ex.restype = wintypes.BOOL + _unlock_file_ex = _kernel32.UnlockFileEx + _unlock_file_ex.argtypes = [ + wintypes.HANDLE, + wintypes.DWORD, + wintypes.DWORD, + wintypes.DWORD, + ctypes.POINTER(_Overlapped), + ] + _unlock_file_ex.restype = wintypes.BOOL +else: + ctypes = None + msvcrt = None + wintypes = None + _lock_file_ex = None + _unlock_file_ex = None + _Overlapped = None ORT_SUPPORT_DIR = r"Microsoft/DeveloperTools/.onnxruntime" +_WINDOWS_FILE_LOCK_LENGTH = 0x7FFFFFFF + + +def _raise_windows_lock_error(message: str) -> None: + error_code = ctypes.get_last_error() if ctypes is not None else 0 + raise OSError(error_code, message) def _resolve_home_dir() -> Path: @@ -72,7 +127,7 @@ def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = N class _ExclusiveFileLock: """Cross-platform exclusive file lock context manager. - Uses fcntl on Unix/Linux/macOS, msvcrt on Windows. 
+ Uses fcntl on Unix/Linux/macOS and LockFileEx on Windows. Prevents cache corruption when multiple processes access the same file. Design decisions: @@ -89,6 +144,7 @@ def __init__(self, file_path: Path, mode: str): self.file_path = file_path self.mode = mode self.file = None + self._windows_overlapped = None def __enter__(self): self.file = open(self.file_path, self.mode, encoding="utf-8") @@ -96,25 +152,44 @@ def __enter__(self): try: # Platform-specific locking if os.name == "posix": - import fcntl - fcntl.flock(self.file.fileno(), fcntl.LOCK_EX) elif os.name == "nt": - import msvcrt - - # Lock 1 byte at position 0 - msvcrt.locking(self.file.fileno(), msvcrt.LK_LOCK, 1) + self._windows_overlapped = _Overlapped() + handle = msvcrt.get_osfhandle(self.file.fileno()) + if not _lock_file_ex( + handle, + _LOCKFILE_EXCLUSIVE_LOCK, + 0, + _WINDOWS_FILE_LOCK_LENGTH, + _WINDOWS_FILE_LOCK_LENGTH, + ctypes.byref(self._windows_overlapped), + ): + _raise_windows_lock_error("Failed to lock telemetry cache file") except Exception: self.file.close() self.file = None + self._windows_overlapped = None raise return self.file def __exit__(self, exc_type, exc_val, exc_tb): if self.file: - # Unlock happens automatically on close - self.file.close() + try: + if os.name == "nt" and self._windows_overlapped is not None: + handle = msvcrt.get_osfhandle(self.file.fileno()) + if not _unlock_file_ex( + handle, + 0, + _WINDOWS_FILE_LOCK_LENGTH, + _WINDOWS_FILE_LOCK_LENGTH, + ctypes.byref(self._windows_overlapped), + ): + _raise_windows_lock_error("Failed to unlock telemetry cache file") + finally: + self.file.close() + self.file = None + self._windows_overlapped = None def _exclusive_file_lock(file_path: Path, mode: str): diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 89100e1c1..8cf87feb9 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -5,20 +5,38 @@ import logging from copy import deepcopy from pathlib import Path -from typing import 
TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union -from olive.common.utils import set_tempdir +from olive.common.utils import hash_dict, hash_string, set_tempdir from olive.hardware.constants import ExecutionProvider from olive.logging import set_default_logger_severity, set_ort_logger_severity, set_verbosity_info from olive.package_config import OlivePackageConfig +from olive.resource_path import create_resource_path, find_all_resources from olive.systems.accelerator_creator import create_accelerator from olive.systems.common import SystemType +from olive.telemetry.telemetry import is_ci_environment +from olive.telemetry.telemetry_extensions import log_recipe_result from olive.workflows.run.config import RunConfig if TYPE_CHECKING: from olive.engine.config import RunPassConfig logger = logging.getLogger(__name__) +RECIPE_HASH_REDACTED_VALUE = "" +RECIPE_HASH_REDACTED_KEYS = { + "output_dir", + "cache_dir", + "tempdir", + "additional_files", + "dockerfile", + "build_context_path", + "python_environment_path", + "prepend_to_path", + "script_dir", + "model_script", + "package_config", + "work_dir", +} def get_required_packages(package_config: OlivePackageConfig, run_config: RunConfig) -> set[str]: @@ -104,7 +122,7 @@ def run_engine(package_config: OlivePackageConfig, run_config: RunConfig): # ort_log_severity_level: C++ logging levels try: - import onnxruntime as ort + import onnxruntime as ort # noqa: PLC0415 ort.set_default_logger_severity(run_config.engine.ort_log_severity_level) except Exception: @@ -152,30 +170,54 @@ def run( list_required_packages: bool = False, package_config: Optional[Union[str, Path, dict]] = None, tempdir: Optional[Union[str, Path]] = None, + recipe_telemetry_metadata: Optional[dict[str, Any]] = None, ): # set tempdir set_tempdir(tempdir) + package_config_provided = package_config is not None if package_config is None: package_config = OlivePackageConfig.get_default_config_path() - package_config = 
OlivePackageConfig.parse_file_or_obj(package_config) - run_config: RunConfig = RunConfig.parse_file_or_obj(run_config) - - if list_required_packages: - # set the log level to INFO for packages - set_verbosity_info() - required_packages = get_required_packages(package_config, run_config) - generate_files_from_packages(required_packages, "olive_requirements.txt") - return None - - if run_config.engine.host and run_config.engine.host.type == SystemType.Docker: - docker_system = run_config.engine.host.create_system() - return docker_system.run_workflow(run_config) - - # set log level for olive - set_default_logger_severity(run_config.engine.log_severity_level) - return run_engine(package_config, run_config) + parsed_run_config = None + success = False + exception_type = None + try: + package_config = OlivePackageConfig.parse_file_or_obj(package_config) + parsed_run_config = RunConfig.parse_file_or_obj(run_config) + + if list_required_packages: + # set the log level to INFO for packages + set_verbosity_info() + required_packages = get_required_packages(package_config, parsed_run_config) + generate_files_from_packages(required_packages, "olive_requirements.txt") + success = True + return None + + if parsed_run_config.engine.host and parsed_run_config.engine.host.type == SystemType.Docker: + docker_system = parsed_run_config.engine.host.create_system() + workflow_output = docker_system.run_workflow(parsed_run_config) + success = True + return workflow_output + + # set log level for olive + set_default_logger_severity(parsed_run_config.engine.log_severity_level) + workflow_output = run_engine(package_config, parsed_run_config) + success = True + return workflow_output + except Exception as exc: + exception_type = type(exc).__name__ + raise + finally: + metadata = _build_recipe_result_metadata( + run_config, + parsed_run_config, + recipe_telemetry_metadata, + list_required_packages=list_required_packages, + package_config_provided=package_config_provided, + ) + recipe_name 
= metadata.pop("recipe_name") + log_recipe_result(recipe_name, success=success, metadata=metadata, exception_type=exception_type) def generate_files_from_packages(packages, file_name): @@ -199,3 +241,146 @@ def get_used_passes_configs(run_config: RunConfig) -> list["RunPassConfig"]: def get_run_on_target(package_config: OlivePackageConfig, pass_config: "RunPassConfig") -> bool: pass_module_config = package_config.get_pass_module_config(pass_config.type) return pass_module_config.run_on_target + + +def _build_recipe_result_metadata( + run_config_input: Union[str, Path, dict], + run_config: Optional[RunConfig], + recipe_telemetry_metadata: Optional[dict[str, Any]], + *, + list_required_packages: bool, + package_config_provided: bool, +) -> dict[str, Any]: + metadata = dict(recipe_telemetry_metadata or {}) + default_source, default_format = _classify_run_config_source(run_config_input) + metadata.setdefault("recipe_source", default_source) + metadata.setdefault("recipe_format", default_format) + metadata.setdefault("execution_mode", "list_required_packages" if list_required_packages else "run") + metadata.setdefault("package_config_provided", package_config_provided) + metadata["is_ci"] = is_ci_environment() + + if run_config is None: + metadata.setdefault("recipe_name", metadata.get("recipe_command") or "WorkflowRun") + return metadata + + run_config_json = run_config.to_json(make_absolute=False) + model_metadata = _extract_input_model_metadata(run_config_json["input_model"]) + target_metadata = _extract_target_metadata(run_config) + pass_types = [pass_config.type for pass_config in get_used_passes_configs(run_config)] + + metadata.setdefault("recipe_name", metadata.get("recipe_command") or run_config.workflow_id) + metadata.setdefault("workflow_id", run_config.workflow_id) + metadata.setdefault("recipe_hash", _build_recipe_hash(run_config_json)) + metadata.setdefault("input_model_type", run_config.input_model.type) + metadata.setdefault("input_model_source", 
model_metadata["input_model_source"]) + metadata.setdefault("input_model_name_hash", model_metadata["input_model_name_hash"]) + metadata.setdefault("model_task", model_metadata["model_task"]) + metadata.setdefault("target_system_type", target_metadata["target_system_type"]) + metadata.setdefault("target_device", target_metadata["target_device"]) + metadata.setdefault("execution_provider", target_metadata["execution_provider"]) + metadata.setdefault("execution_providers", target_metadata["execution_providers"]) + metadata.setdefault("pass_types", ";".join(pass_types)) + metadata.setdefault("pass_count", len(pass_types)) + metadata.setdefault("data_config_count", len(run_config.data_configs)) + metadata.setdefault("search_enabled", bool(run_config.engine.search_strategy)) + return metadata + + +def _classify_run_config_source(run_config_input: Union[str, Path, dict]) -> tuple[str, str]: + if isinstance(run_config_input, dict): + return "config_dict", "dict" + + suffix = Path(run_config_input).suffix.lstrip(".").lower() + return "config_file", suffix or "unknown" + + +def _extract_input_model_metadata(input_model_config: dict[str, Any]) -> dict[str, Optional[str]]: + model_config = input_model_config.get("config", {}) + model_attributes = model_config.get("model_attributes", {}) + model_task = model_attributes.get("hf_task") or model_config.get("task") + raw_identifier = model_attributes.get("_name_or_path") or model_config.get("model_path") + return { + "input_model_source": _classify_input_model_source(raw_identifier), + "input_model_name_hash": _hash_value(raw_identifier), + "model_task": str(model_task) if model_task is not None else None, + } + + +def _classify_input_model_source(model_identifier: Any) -> str: + if model_identifier is None: + return "unknown" + if isinstance(model_identifier, dict): + resource_type = model_identifier.get("type") + if resource_type == "azureml_registry_model": + return "azureml" + return "structured_resource" + + identifier = 
str(model_identifier) + if identifier.startswith("azureml://"): + return "azureml" + if identifier.startswith("https://huggingface.co/"): + return "huggingface_url" + if identifier.startswith(("http://", "https://")): + return "url" + + resource_path = create_resource_path(identifier) + if resource_path.is_local_resource(): + return "local_file" if resource_path.type.value == "file" else "local_folder" + return "string_name" + + +def _extract_target_metadata(run_config: RunConfig) -> dict[str, Optional[str]]: + target_system = run_config.engine.target or run_config.engine.host + target_system_type = target_system.type.value if target_system is not None else None + target_device = None + execution_provider = None + execution_providers = None + + accelerators = target_system.config.accelerators if target_system and target_system.config else None + if accelerators: + accelerator = accelerators[0] + target_device = str(accelerator.device) if accelerator.device is not None else None + ep_values = accelerator.get_ep_strs() or [] + if ep_values: + execution_provider = ep_values[0] + execution_providers = ";".join(ep_values) + + return { + "target_system_type": target_system_type, + "target_device": target_device, + "execution_provider": execution_provider, + "execution_providers": execution_providers, + } + + +def _build_recipe_hash(run_config_json: dict[str, Any]) -> str: + sanitized = deepcopy(run_config_json) + _redact_recipe_hash_keys(sanitized) + for path in find_all_resources(sanitized): + _set_path_value(sanitized, path, RECIPE_HASH_REDACTED_VALUE) + return hash_dict(sanitized)[:16] + + +def _redact_recipe_hash_keys(value: Any, key: Optional[str] = None) -> Any: + if key in RECIPE_HASH_REDACTED_KEYS: + return RECIPE_HASH_REDACTED_VALUE + if isinstance(value, dict): + for child_key in list(value): + value[child_key] = _redact_recipe_hash_keys(value[child_key], child_key) + elif isinstance(value, list): + for index, item in enumerate(value): + value[index] = 
_redact_recipe_hash_keys(item, key) + return value + + +def _set_path_value(container: Any, path: tuple[Any, ...], value: Any) -> None: + current = container + for key in path[:-1]: + current = current[key] + current[path[-1]] = value + + +def _hash_value(value: Any) -> Optional[str]: + if value is None: + return None + return hash_string(str(value))[:16] diff --git a/test/test_telemetry.py b/test/test_telemetry.py new file mode 100644 index 000000000..f65096ee7 --- /dev/null +++ b/test/test_telemetry.py @@ -0,0 +1,108 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import subprocess +import sys +import time +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +from olive.telemetry.library.telemetry_logger import TelemetryLogger +from olive.telemetry.telemetry import ACTION_EVENT_NAME, CACHE_FILE_NAME, RECIPE_EVENT_NAME, Telemetry, TelemetryCacheHandler +from olive.telemetry.utils import _exclusive_file_lock + + +def test_cache_path_uses_env_override(tmp_path, monkeypatch): + cache_dir = tmp_path / "telemetry-cache" + monkeypatch.setenv("OLIVE_TELEMETRY_CACHE_DIR", str(cache_dir)) + + handler = TelemetryCacheHandler(Mock()) + + assert handler.cache_path == cache_dir / CACHE_FILE_NAME + assert isinstance(handler.cache_path, Path) + + +def test_telemetry_logger_uses_explicit_service_name(): + TelemetryLogger.shutdown_default_logger() + TelemetryLogger._instance = None + TelemetryLogger._default_logger = None + + try: + logger = TelemetryLogger.get_default_logger( + connection_string="InstrumentationKey=12345678-1234-1234-1234-123456789abc-tenant", + service_name="Olive", + ) + assert logger._logger_provider.resource.attributes["service.name"] == "Olive" + finally: + TelemetryLogger.shutdown_default_logger() + 
TelemetryLogger._instance = None + TelemetryLogger._default_logger = None + + +def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): + monkeypatch.setenv("CI", "1") + Telemetry._instance = None + + mock_logger = Mock() + mock_logger.register_payload_transmitted_callback.return_value = lambda: None + + try: + with patch("olive.telemetry.telemetry.get_telemetry_logger", return_value=mock_logger): + telemetry = Telemetry() + telemetry.log(ACTION_EVENT_NAME, {"action_name": "WorkflowRun", "duration_ms": 1, "success": False}) + telemetry.log(RECIPE_EVENT_NAME, {"recipe_name": "WorkflowRun", "success": False}) + + assert mock_logger.log.call_count == 1 + assert mock_logger.log.call_args.args[0] == RECIPE_EVENT_NAME + finally: + Telemetry._instance = None + + +@pytest.mark.skipif(os.name != "nt", reason="Windows locking behavior is specific to Windows.") +def test_exclusive_file_lock_blocks_second_append_on_windows(tmp_path): + file_path = tmp_path / "olive.json" + child_code = """ +import sys +import time +from pathlib import Path +from olive.telemetry.utils import _exclusive_file_lock + +path = Path(sys.argv[1]) +path.write_text("payload", encoding="utf-8") +with _exclusive_file_lock(path, "a") as locked_file: + locked_file.write("child") + locked_file.flush() + print("locked", flush=True) + time.sleep(2) +""" + + process = subprocess.Popen( + [sys.executable, "-c", child_code, str(file_path)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + try: + assert process.stdout is not None + assert process.stdout.readline().strip() == "locked" + + start = time.perf_counter() + with _exclusive_file_lock(file_path, mode="a") as locked_file: + wait_time = time.perf_counter() - start + locked_file.write("parent") + + assert wait_time >= 1.0 + finally: + try: + stdout, stderr = process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + pytest.fail(f"child lock process timed out: 
stdout={stdout!r} stderr={stderr!r}") + + assert process.returncode == 0, stderr + assert file_path.read_text(encoding="utf-8") == "payloadchildparent" diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index 82cc4980b..241c0e8d6 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -125,3 +125,93 @@ def test_run_packages(): # cleanup requirements_file_path.unlink() + + +@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.workflows.run.run.run_engine") +def test_run_logs_recipe_result_success(mock_run_engine, mock_log_recipe_result): + config = { + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-0.5B-Instruct", + "task": "text-generation", + "load_kwargs": {"attn_implementation": "eager"}, + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [{"device": "gpu", "execution_providers": ["CUDAExecutionProvider"]}], + } + }, + "engine": {"target": "local_system"}, + "passes": {"dynamic_quant": {"type": "OnnxDynamicQuantization"}}, + } + expected_output = object() + mock_run_engine.return_value = expected_output + + output = olive_run( + config, + recipe_telemetry_metadata={ + "recipe_name": "Quantize", + "recipe_command": "Quantize", + "recipe_source": "generated_cli", + "recipe_format": "generated", + }, + ) + + assert output is expected_output + mock_log_recipe_result.assert_called_once() + assert mock_log_recipe_result.call_args.args[0] == "Quantize" + assert mock_log_recipe_result.call_args.kwargs["success"] is True + + metadata = mock_log_recipe_result.call_args.kwargs["metadata"] + assert metadata["recipe_command"] == "Quantize" + assert metadata["recipe_source"] == "generated_cli" + assert metadata["recipe_format"] == "generated" + assert metadata["workflow_id"] == "default_workflow" + assert metadata["input_model_type"] == "hfmodel" + assert metadata["input_model_source"] == "string_name" + assert metadata["model_task"] == 
"text-generation" + assert metadata["target_system_type"] == "LocalSystem" + assert metadata["target_device"] == "gpu" + assert metadata["execution_provider"] == "CUDAExecutionProvider" + assert metadata["execution_providers"] == "CUDAExecutionProvider" + assert metadata["pass_types"] == "onnxdynamicquantization" + assert metadata["pass_count"] == 1 + assert metadata["data_config_count"] == 0 + assert metadata["search_enabled"] is False + assert metadata["package_config_provided"] is False + assert metadata["is_ci"] is False + assert metadata["recipe_hash"] + assert metadata["input_model_name_hash"] + + +@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.workflows.run.run.run_engine") +def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result): + config = { + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-0.5B-Instruct", + "task": "text-generation", + "load_kwargs": {"attn_implementation": "eager"}, + }, + "passes": {"dynamic_quant": {"type": "OnnxDynamicQuantization"}}, + } + mock_run_engine.side_effect = ValueError("recipe failed") + + with pytest.raises(ValueError, match="recipe failed"): + olive_run( + config, + recipe_telemetry_metadata={ + "recipe_name": "Quantize", + "recipe_command": "Quantize", + "recipe_source": "generated_cli", + "recipe_format": "generated", + }, + ) + + mock_log_recipe_result.assert_called_once() + assert mock_log_recipe_result.call_args.args[0] == "Quantize" + assert mock_log_recipe_result.call_args.kwargs["success"] is False + assert mock_log_recipe_result.call_args.kwargs["exception_type"] == "ValueError" From 15a5c3108bbb94602c343c064f2573fee7f4661a Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 13:42:47 -0500 Subject: [PATCH 06/24] Remove avoidable telemetry lint suppressions Replace optional runtime imports with importlib-based lookups so the recent telemetry changes stay lint-clean without adding new noqa markers. 
Keep the focused telemetry tests import-sorted and ready for CI. Files changed: - olive/cli/base.py - olive/workflows/run/run.py - test/test_telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/cli/base.py | 13 +++++++------ olive/workflows/run/run.py | 3 ++- test/test_telemetry.py | 9 ++++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/olive/cli/base.py b/olive/cli/base.py index 90ba0dcc3..5bc34af6f 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import importlib import json import re import tempfile @@ -10,6 +11,8 @@ from pathlib import Path from typing import ClassVar, Optional +from packaging import version + from olive.common.constants import DEFAULT_HF_TASK from olive.common.user_module_loader import UserModuleLoader from olive.common.utils import hf_repo_exists, set_nested_dict_value, unescaped_str @@ -60,18 +63,16 @@ def _get_recipe_telemetry_metadata(self) -> dict[str, str]: @staticmethod def _parse_extra_options(kv_items): - from onnxruntime_genai import __version__ as OrtGenaiVersion # noqa: PLC0415 - from packaging import version # noqa: PLC0415 + ort_genai = importlib.import_module("onnxruntime_genai") + ort_genai_version = ort_genai.__version__ - if version.parse(OrtGenaiVersion) <= version.parse("0.9.0"): + if version.parse(ort_genai_version) <= version.parse("0.9.0"): raise ValueError( "onnxruntime-genai version <= 0.9.0 is not supported for extra_options in CLI. " "Please either upgrade to onnxruntime-genai version > 0.9.0 or use the model builder pass directly in the config file." 
) - from onnxruntime_genai.models.builder import parse_extra_options # noqa: PLC0415 - - return parse_extra_options(kv_items) + return importlib.import_module("onnxruntime_genai.models.builder").parse_extra_options(kv_items) @staticmethod def _save_config_file(config: dict): diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 8cf87feb9..e8bae5ba0 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import importlib import logging from copy import deepcopy from pathlib import Path @@ -122,7 +123,7 @@ def run_engine(package_config: OlivePackageConfig, run_config: RunConfig): # ort_log_severity_level: C++ logging levels try: - import onnxruntime as ort # noqa: PLC0415 + ort = importlib.import_module("onnxruntime") ort.set_default_logger_severity(run_config.engine.ort_log_severity_level) except Exception: diff --git a/test/test_telemetry.py b/test/test_telemetry.py index f65096ee7..1af052f4e 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -10,8 +10,15 @@ from unittest.mock import Mock, patch import pytest + from olive.telemetry.library.telemetry_logger import TelemetryLogger -from olive.telemetry.telemetry import ACTION_EVENT_NAME, CACHE_FILE_NAME, RECIPE_EVENT_NAME, Telemetry, TelemetryCacheHandler +from olive.telemetry.telemetry import ( + ACTION_EVENT_NAME, + CACHE_FILE_NAME, + RECIPE_EVENT_NAME, + Telemetry, + TelemetryCacheHandler, +) from olive.telemetry.utils import _exclusive_file_lock From eb524c7a58be5d4208eec777157b9c42a1c617ca Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 13:53:39 -0500 Subject: [PATCH 07/24] Move service name handling into telemetry logger Keep exporter options focused on transport/export concerns and move service.name defaults into the logger/resource layer where they 
belong. This keeps Olive's explicit app name override separate from the shared logger fallback and removes unnecessary plumbing. Files changed: - olive/telemetry/library/options.py - olive/telemetry/library/telemetry_logger.py - test/test_telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/library/options.py | 1 - olive/telemetry/library/telemetry_logger.py | 18 +++++++++--------- test/test_telemetry.py | 18 +++++++++++++++++- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/olive/telemetry/library/options.py b/olive/telemetry/library/options.py index 31fd1ba19..dd934cad2 100644 --- a/olive/telemetry/library/options.py +++ b/olive/telemetry/library/options.py @@ -62,7 +62,6 @@ class OneCollectorExporterOptions: """Configuration options for OneCollector exporter.""" connection_string: Optional[str] = None - service_name: Optional[str] = None transport_options: OneCollectorTransportOptions = field(default_factory=OneCollectorTransportOptions) # Internal fields populated during validation diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 928031e15..19f671da1 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -19,7 +19,7 @@ from olive.telemetry.library.options import OneCollectorExporterOptions from olive.version import __version__ as VERSION -DEFAULT_SERVICE_NAME = __name__.split(".", maxsplit=1)[0] +DEFAULT_SERVICE_NAME = "olive" class TelemetryLogger: @@ -36,25 +36,27 @@ class TelemetryLogger: _logger_exporter: Optional[OneCollectorLogExporter] = None _logger_provider: Optional[LoggerProvider] = None - def __new__(cls, options: Optional[OneCollectorExporterOptions] = None): + def __new__(cls, options: Optional[OneCollectorExporterOptions] = None, service_name: Optional[str] = None): """Create or return the singleton instance. 
Args: options: Exporter options (only used on first instantiation) + service_name: Logical application/service name for emitted telemetry (only used on first instantiation) """ with cls._singleton_lock: if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance._initialize(options) + cls._instance._initialize(options, service_name) return cls._instance - def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: + def _initialize(self, options: Optional[OneCollectorExporterOptions], service_name: Optional[str]) -> None: """Initialize the logger (called only once). Args: options: Exporter configuration options + service_name: Logical application/service name for emitted telemetry """ try: @@ -62,7 +64,7 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: self._logger_exporter = OneCollectorLogExporter(options=options) # Create logger provider - service_name = options.service_name or DEFAULT_SERVICE_NAME + service_name = service_name or DEFAULT_SERVICE_NAME self._logger_provider = LoggerProvider( resource=Resource.create( { @@ -164,10 +166,8 @@ def get_default_logger( if cls._default_logger is None: options = None if connection_string: - options = OneCollectorExporterOptions( - connection_string=connection_string, service_name=service_name - ) - cls._default_logger = cls(options=options) + options = OneCollectorExporterOptions(connection_string=connection_string) + cls._default_logger = cls(options=options, service_name=service_name) return cls._default_logger diff --git a/test/test_telemetry.py b/test/test_telemetry.py index 1af052f4e..e05a1acc8 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -11,7 +11,7 @@ import pytest -from olive.telemetry.library.telemetry_logger import TelemetryLogger +from olive.telemetry.library.telemetry_logger import DEFAULT_SERVICE_NAME, TelemetryLogger from olive.telemetry.telemetry import ( ACTION_EVENT_NAME, CACHE_FILE_NAME, @@ -49,6 +49,22 @@ def 
test_telemetry_logger_uses_explicit_service_name(): TelemetryLogger._default_logger = None +def test_telemetry_logger_uses_default_service_name(): + TelemetryLogger.shutdown_default_logger() + TelemetryLogger._instance = None + TelemetryLogger._default_logger = None + + try: + logger = TelemetryLogger.get_default_logger( + connection_string="InstrumentationKey=12345678-1234-1234-1234-123456789abc-tenant" + ) + assert logger._logger_provider.resource.attributes["service.name"] == DEFAULT_SERVICE_NAME + finally: + TelemetryLogger.shutdown_default_logger() + TelemetryLogger._instance = None + TelemetryLogger._default_logger = None + + def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): monkeypatch.setenv("CI", "1") Telemetry._instance = None From 862a9a1f37eccba295698657e49e386f8c8c3f3a Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 14:03:54 -0500 Subject: [PATCH 08/24] Scope service-name cleanup to Olive usage Keep Olive's explicit service-name override in the logger path, but restore the previous shared-library fallback and compatibility behavior so this cleanup does not broaden unrelated API or default changes. 
Files changed: - olive/telemetry/library/options.py - olive/telemetry/library/telemetry_logger.py - test/test_telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/library/options.py | 1 + olive/telemetry/library/telemetry_logger.py | 4 ++-- test/test_telemetry.py | 18 +----------------- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/olive/telemetry/library/options.py b/olive/telemetry/library/options.py index dd934cad2..31fd1ba19 100644 --- a/olive/telemetry/library/options.py +++ b/olive/telemetry/library/options.py @@ -62,6 +62,7 @@ class OneCollectorExporterOptions: """Configuration options for OneCollector exporter.""" connection_string: Optional[str] = None + service_name: Optional[str] = None transport_options: OneCollectorTransportOptions = field(default_factory=OneCollectorTransportOptions) # Internal fields populated during validation diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 19f671da1..d39876848 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -19,7 +19,7 @@ from olive.telemetry.library.options import OneCollectorExporterOptions from olive.version import __version__ as VERSION -DEFAULT_SERVICE_NAME = "olive" +DEFAULT_SERVICE_NAME = __name__.split(".", maxsplit=1)[0] class TelemetryLogger: @@ -64,7 +64,7 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions], service_na self._logger_exporter = OneCollectorLogExporter(options=options) # Create logger provider - service_name = service_name or DEFAULT_SERVICE_NAME + service_name = service_name or (options.service_name if options else None) or DEFAULT_SERVICE_NAME self._logger_provider = LoggerProvider( resource=Resource.create( { diff --git a/test/test_telemetry.py b/test/test_telemetry.py index e05a1acc8..1af052f4e 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -11,7 +11,7 @@ 
import pytest -from olive.telemetry.library.telemetry_logger import DEFAULT_SERVICE_NAME, TelemetryLogger +from olive.telemetry.library.telemetry_logger import TelemetryLogger from olive.telemetry.telemetry import ( ACTION_EVENT_NAME, CACHE_FILE_NAME, @@ -49,22 +49,6 @@ def test_telemetry_logger_uses_explicit_service_name(): TelemetryLogger._default_logger = None -def test_telemetry_logger_uses_default_service_name(): - TelemetryLogger.shutdown_default_logger() - TelemetryLogger._instance = None - TelemetryLogger._default_logger = None - - try: - logger = TelemetryLogger.get_default_logger( - connection_string="InstrumentationKey=12345678-1234-1234-1234-123456789abc-tenant" - ) - assert logger._logger_provider.resource.attributes["service.name"] == DEFAULT_SERVICE_NAME - finally: - TelemetryLogger.shutdown_default_logger() - TelemetryLogger._instance = None - TelemetryLogger._default_logger = None - - def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): monkeypatch.setenv("CI", "1") Telemetry._instance = None From 86d4d02f2cf27e238b76b2b6d4b6db464e735531 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 14:28:37 -0500 Subject: [PATCH 09/24] Revert unnecessary non-telemetry branch changes Trim the branch back to telemetry behavior and the minimal plumbing needed to support it. Restore the original local-import patterns in CLI and workflow code, keep only targeted lint suppressions where those restored lines are required, and simplify the telemetry logger app-name plumbing without changing the feature behavior. 
Files changed: - olive/cli/base.py - olive/cli/run.py - olive/telemetry/library/telemetry_logger.py - olive/workflows/run/run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/cli/base.py | 19 ++++++++++--------- olive/cli/run.py | 5 +++-- olive/telemetry/library/telemetry_logger.py | 20 ++++++++++---------- olive/workflows/run/run.py | 3 +-- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/olive/cli/base.py b/olive/cli/base.py index 5bc34af6f..ce1671040 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -2,17 +2,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -import importlib import json import re -import tempfile from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace from pathlib import Path from typing import ClassVar, Optional -from packaging import version - from olive.common.constants import DEFAULT_HF_TASK from olive.common.user_module_loader import UserModuleLoader from olive.common.utils import hf_repo_exists, set_nested_dict_value, unescaped_str @@ -20,7 +16,6 @@ from olive.hardware.accelerator import AcceleratorSpec from olive.hardware.constants import DEVICE_TO_EXECUTION_PROVIDERS from olive.resource_path import OLIVE_RESOURCE_ANNOTATIONS -from olive.workflows import run as olive_run class BaseOliveCLICommand(ABC): @@ -34,6 +29,10 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option parser.error(f"Unknown arguments: {unknown_args}") def _run_workflow(self): + import tempfile # noqa: PLC0415 + + from olive.workflows import run as olive_run # noqa: PLC0415 + Path(self.args.output_path).mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir: @@ -63,16 +62,18 @@ def _get_recipe_telemetry_metadata(self) -> dict[str, str]: @staticmethod def 
_parse_extra_options(kv_items): - ort_genai = importlib.import_module("onnxruntime_genai") - ort_genai_version = ort_genai.__version__ + from onnxruntime_genai import __version__ as OrtGenaiVersion # noqa: PLC0415 + from packaging import version # noqa: PLC0415 - if version.parse(ort_genai_version) <= version.parse("0.9.0"): + if version.parse(OrtGenaiVersion) <= version.parse("0.9.0"): raise ValueError( "onnxruntime-genai version <= 0.9.0 is not supported for extra_options in CLI. " "Please either upgrade to onnxruntime-genai version > 0.9.0 or use the model builder pass directly in the config file." ) - return importlib.import_module("onnxruntime_genai.models.builder").parse_extra_options(kv_items) + from onnxruntime_genai.models.builder import parse_extra_options # noqa: PLC0415 + + return parse_extra_options(kv_items) @staticmethod def _save_config_file(config: dict): diff --git a/olive/cli/run.py b/olive/cli/run.py index 1d8080581..f7c9f6e81 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -12,9 +12,7 @@ add_telemetry_options, get_input_model_config, ) -from olive.common.config_utils import load_config_file from olive.telemetry import action -from olive.workflows import run as olive_run class WorkflowRunCommand(BaseOliveCLICommand): @@ -52,6 +50,9 @@ def register_subcommand(parser: ArgumentParser): @action def run(self): + from olive.common.config_utils import load_config_file # noqa: PLC0415 + from olive.workflows import run as olive_run # noqa: PLC0415 + # allow the run_config to be a dict already (for api use) run_config_input = self.args.run_config run_config = run_config_input diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index d39876848..626e1da87 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -19,8 +19,6 @@ from olive.telemetry.library.options import OneCollectorExporterOptions from olive.version import __version__ as VERSION 
-DEFAULT_SERVICE_NAME = __name__.split(".", maxsplit=1)[0] - class TelemetryLogger: """Singleton telemetry logger for simplified OneCollector integration. @@ -36,27 +34,25 @@ class TelemetryLogger: _logger_exporter: Optional[OneCollectorLogExporter] = None _logger_provider: Optional[LoggerProvider] = None - def __new__(cls, options: Optional[OneCollectorExporterOptions] = None, service_name: Optional[str] = None): + def __new__(cls, options: Optional[OneCollectorExporterOptions] = None): """Create or return the singleton instance. Args: options: Exporter options (only used on first instantiation) - service_name: Logical application/service name for emitted telemetry (only used on first instantiation) """ with cls._singleton_lock: if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance._initialize(options, service_name) + cls._instance._initialize(options) return cls._instance - def _initialize(self, options: Optional[OneCollectorExporterOptions], service_name: Optional[str]) -> None: + def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: """Initialize the logger (called only once). 
Args: options: Exporter configuration options - service_name: Logical application/service name for emitted telemetry """ try: @@ -64,7 +60,9 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions], service_na self._logger_exporter = OneCollectorLogExporter(options=options) # Create logger provider - service_name = service_name or (options.service_name if options else None) or DEFAULT_SERVICE_NAME + service_name = ( + options.service_name if options and options.service_name else __name__.split(".", maxsplit=1)[0] + ) self._logger_provider = LoggerProvider( resource=Resource.create( { @@ -166,8 +164,10 @@ def get_default_logger( if cls._default_logger is None: options = None if connection_string: - options = OneCollectorExporterOptions(connection_string=connection_string) - cls._default_logger = cls(options=options, service_name=service_name) + options = OneCollectorExporterOptions( + connection_string=connection_string, service_name=service_name + ) + cls._default_logger = cls(options=options) return cls._default_logger diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index e8bae5ba0..8cf87feb9 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -2,7 +2,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -import importlib import logging from copy import deepcopy from pathlib import Path @@ -123,7 +122,7 @@ def run_engine(package_config: OlivePackageConfig, run_config: RunConfig): # ort_log_severity_level: C++ logging levels try: - ort = importlib.import_module("onnxruntime") + import onnxruntime as ort # noqa: PLC0415 ort.set_default_logger_severity(run_config.engine.ort_log_severity_level) except Exception: From 99f0e17dec10dd84a9ccdd1199f7ac8aa102e7b1 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 14:38:13 -0500 Subject: [PATCH 10/24] Address PR review feedback on telemetry changes Fix the correctness and security issues raised on PR #2441 by handling empty telemetry cache-dir overrides safely, preserving non-empty unreadable flush files instead of deleting them, restoring the legacy .json.flush naming pattern, handling non-pathlike recipe config inputs without masking the original error, and cleaning up the Windows ctypes import pattern for CodeQL. 
Files changed: - olive/telemetry/telemetry.py - olive/telemetry/utils.py - olive/workflows/run/run.py - test/test_telemetry.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 18 +++++++++++++----- olive/telemetry/utils.py | 2 +- olive/workflows/run/run.py | 10 +++++++--- test/test_telemetry.py | 21 +++++++++++++++++++++ test/workflows/test_workflow_run.py | 5 +++++ 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 6a9bebe17..5a7f911bc 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -274,8 +274,11 @@ def cache_path(self) -> Optional[Path]: """ telemetry_cache_dir = None - if "OLIVE_TELEMETRY_CACHE_DIR" in os.environ: - telemetry_cache_dir = Path(os.environ["OLIVE_TELEMETRY_CACHE_DIR"]).expanduser() + telemetry_cache_dir_override = os.environ.get("OLIVE_TELEMETRY_CACHE_DIR") + if telemetry_cache_dir_override: + telemetry_cache_dir_override = telemetry_cache_dir_override.strip() + if telemetry_cache_dir_override: + telemetry_cache_dir = Path(telemetry_cache_dir_override).expanduser() if not telemetry_cache_dir: telemetry_cache_dir = get_telemetry_base_dir() / "cache" return telemetry_cache_dir / self._cache_file_name @@ -370,7 +373,9 @@ def _restore_flush_file(self, flush_path: Optional[Path], cache_path: Path) -> N cache_file.write(line + "\n") flush_path.unlink(missing_ok=True) except Exception: - pass + # Best-effort cache restore must never interrupt telemetry flow. + # Leave the flush file in place so a later retry can attempt recovery again. + return def _flush_cache_file(self, cache_path: Path) -> None: """Flush cached events back to telemetry service. 
@@ -385,7 +390,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: return # Atomically rename to claim ownership — only one process can succeed - flush_path = cache_path.with_suffix(".flush") + flush_path = cache_path.with_name(f"{cache_path.name}.flush") try: cache_path.replace(flush_path) except FileNotFoundError: @@ -393,7 +398,10 @@ def _flush_cache_file(self, cache_path: Path) -> None: entries = _read_cache_entries(flush_path) if not entries: - flush_path.unlink(missing_ok=True) + if flush_path.stat().st_size == 0: + flush_path.unlink(missing_ok=True) + else: + self._restore_flush_file(flush_path, cache_path) return # Replay cached events — _is_flushing flag prevents re-caching diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 830ca055d..716b4831c 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -19,8 +19,8 @@ if os.name == "nt": import ctypes + import ctypes.wintypes as wintypes import msvcrt - from ctypes import wintypes _LOCKFILE_EXCLUSIVE_LOCK = 0x00000002 diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 8cf87feb9..0a29f2162 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------- import logging from copy import deepcopy +from os import PathLike from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union @@ -286,12 +287,15 @@ def _build_recipe_result_metadata( return metadata -def _classify_run_config_source(run_config_input: Union[str, Path, dict]) -> tuple[str, str]: +def _classify_run_config_source(run_config_input: Any) -> tuple[str, str]: if isinstance(run_config_input, dict): return "config_dict", "dict" - suffix = Path(run_config_input).suffix.lstrip(".").lower() - return "config_file", suffix or "unknown" + if isinstance(run_config_input, (str, PathLike)): + suffix = Path(run_config_input).suffix.lstrip(".").lower() + return "config_file", 
suffix or "unknown" + + return "config_object", "object" def _extract_input_model_metadata(input_model_config: dict[str, Any]) -> dict[str, Optional[str]]: diff --git a/test/test_telemetry.py b/test/test_telemetry.py index 1af052f4e..5ebfe0946 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -32,6 +32,14 @@ def test_cache_path_uses_env_override(tmp_path, monkeypatch): assert isinstance(handler.cache_path, Path) +def test_cache_path_ignores_empty_env_override(tmp_path, monkeypatch): + monkeypatch.setenv("OLIVE_TELEMETRY_CACHE_DIR", " ") + + with patch("olive.telemetry.telemetry.get_telemetry_base_dir", return_value=tmp_path): + handler = TelemetryCacheHandler(Mock()) + assert handler.cache_path == tmp_path / "cache" / CACHE_FILE_NAME + + def test_telemetry_logger_uses_explicit_service_name(): TelemetryLogger.shutdown_default_logger() TelemetryLogger._instance = None @@ -68,6 +76,19 @@ def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): Telemetry._instance = None +def test_flush_cache_preserves_nonempty_unreadable_file(tmp_path): + handler = TelemetryCacheHandler(Mock()) + cache_path = tmp_path / CACHE_FILE_NAME + flush_path = cache_path.with_name(f"{cache_path.name}.flush") + cache_path.write_text("not-json\n", encoding="utf-8") + + handler._flush_cache_file(cache_path) + + assert cache_path.exists() + assert cache_path.read_text(encoding="utf-8") == "not-json\n" + assert not flush_path.exists() + + @pytest.mark.skipif(os.name != "nt", reason="Windows locking behavior is specific to Windows.") def test_exclusive_file_lock_blocks_second_append_on_windows(tmp_path): file_path = tmp_path / "olive.json" diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index 241c0e8d6..3d7a4dbc5 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -6,6 +6,7 @@ import pytest from olive.workflows import run as olive_run +from olive.workflows.run.run import 
_classify_run_config_source from test.utils import ( get_pytorch_model, get_pytorch_model_config, @@ -215,3 +216,7 @@ def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result) assert mock_log_recipe_result.call_args.args[0] == "Quantize" assert mock_log_recipe_result.call_args.kwargs["success"] is False assert mock_log_recipe_result.call_args.kwargs["exception_type"] == "ValueError" + + +def test_classify_run_config_source_handles_non_pathlike_object(): + assert _classify_run_config_source(object()) == ("config_object", "object") From 788e91c547bf181fd29490deab1198003ce96c03 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 15:46:34 -0500 Subject: [PATCH 11/24] Remove low-value service-name telemetry test Drop the explicit service.name wiring test from test/test_telemetry.py since it is mostly implementation-detail coverage and does not protect the higher-value telemetry behavior changes on this branch. Files changed: - test/test_telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/test_telemetry.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/test/test_telemetry.py b/test/test_telemetry.py index 5ebfe0946..49507c8d5 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -11,7 +11,6 @@ import pytest -from olive.telemetry.library.telemetry_logger import TelemetryLogger from olive.telemetry.telemetry import ( ACTION_EVENT_NAME, CACHE_FILE_NAME, @@ -40,23 +39,6 @@ def test_cache_path_ignores_empty_env_override(tmp_path, monkeypatch): assert handler.cache_path == tmp_path / "cache" / CACHE_FILE_NAME -def test_telemetry_logger_uses_explicit_service_name(): - TelemetryLogger.shutdown_default_logger() - TelemetryLogger._instance = None - TelemetryLogger._default_logger = None - - try: - logger = TelemetryLogger.get_default_logger( - connection_string="InstrumentationKey=12345678-1234-1234-1234-123456789abc-tenant", - service_name="Olive", - ) - assert 
logger._logger_provider.resource.attributes["service.name"] == "Olive" - finally: - TelemetryLogger.shutdown_default_logger() - TelemetryLogger._instance = None - TelemetryLogger._default_logger = None - - def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): monkeypatch.setenv("CI", "1") Telemetry._instance = None From 06f125245c7eee0456db4e40c605a314c00f0efc Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 16:16:09 -0500 Subject: [PATCH 12/24] Address remaining GitHub Advanced Security comments Resolve the remaining github-advanced-security findings by removing unused PLC0415 noqa markers, keeping the optional-import behavior via minimal import cleanup, and updating the telemetry tests to satisfy the protected-access and consider-using-with lint comments without changing the tested behavior. Files changed: - olive/cli/base.py - olive/cli/run.py - olive/workflows/run/run.py - test/test_telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/cli/base.py | 19 +++++++++---------- olive/cli/run.py | 5 ++--- olive/workflows/run/run.py | 3 ++- test/test_telemetry.py | 9 ++++----- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/olive/cli/base.py b/olive/cli/base.py index ce1671040..5bc34af6f 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -2,13 +2,17 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +import importlib import json import re +import tempfile from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace from pathlib import Path from typing import ClassVar, Optional +from packaging import version + from olive.common.constants import DEFAULT_HF_TASK from olive.common.user_module_loader import UserModuleLoader from olive.common.utils import hf_repo_exists, set_nested_dict_value, unescaped_str @@ -16,6 +20,7 @@ from olive.hardware.accelerator import AcceleratorSpec from olive.hardware.constants import DEVICE_TO_EXECUTION_PROVIDERS from olive.resource_path import OLIVE_RESOURCE_ANNOTATIONS +from olive.workflows import run as olive_run class BaseOliveCLICommand(ABC): @@ -29,10 +34,6 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option parser.error(f"Unknown arguments: {unknown_args}") def _run_workflow(self): - import tempfile # noqa: PLC0415 - - from olive.workflows import run as olive_run # noqa: PLC0415 - Path(self.args.output_path).mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir: @@ -62,18 +63,16 @@ def _get_recipe_telemetry_metadata(self) -> dict[str, str]: @staticmethod def _parse_extra_options(kv_items): - from onnxruntime_genai import __version__ as OrtGenaiVersion # noqa: PLC0415 - from packaging import version # noqa: PLC0415 + ort_genai = importlib.import_module("onnxruntime_genai") + ort_genai_version = ort_genai.__version__ - if version.parse(OrtGenaiVersion) <= version.parse("0.9.0"): + if version.parse(ort_genai_version) <= version.parse("0.9.0"): raise ValueError( "onnxruntime-genai version <= 0.9.0 is not supported for extra_options in CLI. " "Please either upgrade to onnxruntime-genai version > 0.9.0 or use the model builder pass directly in the config file." 
) - from onnxruntime_genai.models.builder import parse_extra_options # noqa: PLC0415 - - return parse_extra_options(kv_items) + return importlib.import_module("onnxruntime_genai.models.builder").parse_extra_options(kv_items) @staticmethod def _save_config_file(config: dict): diff --git a/olive/cli/run.py b/olive/cli/run.py index f7c9f6e81..1d8080581 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -12,7 +12,9 @@ add_telemetry_options, get_input_model_config, ) +from olive.common.config_utils import load_config_file from olive.telemetry import action +from olive.workflows import run as olive_run class WorkflowRunCommand(BaseOliveCLICommand): @@ -50,9 +52,6 @@ def register_subcommand(parser: ArgumentParser): @action def run(self): - from olive.common.config_utils import load_config_file # noqa: PLC0415 - from olive.workflows import run as olive_run # noqa: PLC0415 - # allow the run_config to be a dict already (for api use) run_config_input = self.args.run_config run_config = run_config_input diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 0a29f2162..d45d0a329 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import importlib import logging from copy import deepcopy from os import PathLike @@ -123,7 +124,7 @@ def run_engine(package_config: OlivePackageConfig, run_config: RunConfig): # ort_log_severity_level: C++ logging levels try: - import onnxruntime as ort # noqa: PLC0415 + ort = importlib.import_module("onnxruntime") ort.set_default_logger_severity(run_config.engine.ort_log_severity_level) except Exception: diff --git a/test/test_telemetry.py b/test/test_telemetry.py index 49507c8d5..426ef968c 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. 
All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +# pylint: disable=protected-access import os import subprocess import sys @@ -89,14 +90,12 @@ def test_exclusive_file_lock_blocks_second_append_on_windows(tmp_path): time.sleep(2) """ - process = subprocess.Popen( + with subprocess.Popen( [sys.executable, "-c", child_code, str(file_path)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - ) - - try: + ) as process: assert process.stdout is not None assert process.stdout.readline().strip() == "locked" @@ -106,7 +105,7 @@ def test_exclusive_file_lock_blocks_second_append_on_windows(tmp_path): locked_file.write("parent") assert wait_time >= 1.0 - finally: + try: stdout, stderr = process.communicate(timeout=5) except subprocess.TimeoutExpired: From 42dbce5d02946d8e7af19f58b5172a8db0da33b0 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 16:31:29 -0500 Subject: [PATCH 13/24] Simplify telemetry utils responsibilities Remove dead base64 cache helpers from olive/telemetry/utils.py and move exception formatting next to the telemetry extension code that actually uses it. Keep the Win32 locking and cache-dir logic intact while reducing unrelated utility clutter. 
Files changed: - olive/telemetry/utils.py - olive/telemetry/telemetry_extensions.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry_extensions.py | 21 ++++++++++++- olive/telemetry/utils.py | 42 +------------------------ 2 files changed, 21 insertions(+), 42 deletions(-) diff --git a/olive/telemetry/telemetry_extensions.py b/olive/telemetry/telemetry_extensions.py index ff5a2c703..8b5fc0412 100644 --- a/olive/telemetry/telemetry_extensions.py +++ b/olive/telemetry/telemetry_extensions.py @@ -6,11 +6,11 @@ import functools import inspect import time +import traceback from types import TracebackType from typing import Any, Callable, Optional, TypeVar from olive.telemetry.telemetry import ACTION_EVENT_NAME, ERROR_EVENT_NAME, RECIPE_EVENT_NAME, _get_logger -from olive.telemetry.utils import _format_exception_message _TFunc = TypeVar("_TFunc", bound=Callable[..., Any]) @@ -61,6 +61,25 @@ def log_recipe_result( telemetry.log(RECIPE_EVENT_NAME, attributes, metadata) +def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = None) -> str: + """Format an exception and trim local paths for readability.""" + folder = "Olive" + file_line = 'File "' + formatted = traceback.format_exception(type(ex), ex, tb, limit=5) + lines = [] + for line in formatted: + line_trunc = line.strip() + if line_trunc.startswith(file_line) and folder in line_trunc: + idx = line_trunc.find(folder) + if idx != -1: + line_trunc = line_trunc[idx + len(folder) :] + elif line_trunc.startswith(file_line): + idx = line_trunc[len(file_line) :].find('"') + line_trunc = line_trunc[idx + len(file_line) :] + lines.append(line_trunc) + return "\n".join(lines) + + def _resolve_invoked_from(skip_frames: int = 0) -> str: """Resolve how Olive was invoked by examining the call stack. 
diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 716b4831c..28cec45eb 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -2,15 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -import base64 import functools import os import platform import tempfile -import traceback from pathlib import Path -from types import TracebackType -from typing import ClassVar, Optional +from typing import ClassVar if os.name == "posix": import fcntl @@ -105,25 +102,6 @@ def get_telemetry_base_dir() -> Path: return Path(cache_dir).expanduser() / ORT_SUPPORT_DIR -def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = None) -> str: - """Format an exception and trim local paths for readability.""" - folder = "Olive" - file_line = 'File "' - formatted = traceback.format_exception(type(ex), ex, tb, limit=5) - lines = [] - for line in formatted: - line_trunc = line.strip() - if line_trunc.startswith(file_line) and folder in line_trunc: - idx = line_trunc.find(folder) - if idx != -1: - line_trunc = line_trunc[idx + len(folder) :] - elif line_trunc.startswith(file_line): - idx = line_trunc[len(file_line) :].find('"') - line_trunc = line_trunc[idx + len(file_line) :] - lines.append(line_trunc) - return "\n".join(lines) - - class _ExclusiveFileLock: """Cross-platform exclusive file lock context manager. @@ -200,21 +178,3 @@ def _exclusive_file_lock(file_path: Path, mode: str): :return: Context manager that returns an open file handle. """ return _ExclusiveFileLock(file_path, mode) - - -def _encode_cache_line(plaintext: str) -> str: - """Encode a single cache line using base64. - - :param plaintext: The plaintext string to encode. - :return: Base64-encoded string (safe for a single text line). 
- """ - return base64.b64encode(plaintext.encode("utf-8")).decode("ascii") - - -def _decode_cache_line(encoded: str) -> str: - """Decode a single base64-encoded cache line. - - :param encoded: The base64-encoded string. - :return: The decoded plaintext string. - """ - return base64.b64decode(encoded.encode("ascii")).decode("utf-8") From d948cd9e90cc07d4a1a83dbec130bfb40ddbfad1 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 17:14:28 -0500 Subject: [PATCH 14/24] Store CI detection result once in telemetry init Compute the CI environment flag once during Telemetry initialization and reuse it for recipe-only gating and heartbeat suppression instead of calling the check twice back-to-back. Files changed: - olive/telemetry/telemetry.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 5a7f911bc..68c26bfcc 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -470,8 +470,9 @@ def __init__(self): self._cache_handler = TelemetryCacheHandler(self) self._setup_payload_callbacks() - self._recipe_only_ci_telemetry = self._is_ci_environment() - if not self._is_ci_environment(): + is_ci = self._is_ci_environment() + self._recipe_only_ci_telemetry = is_ci + if not is_ci: self._log_heartbeat() if os.environ.get("OLIVE_DISABLE_TELEMETRY") == "1": self.disable_telemetry() From 1ff609ae0256f86a81f077e6767cfc3a14ff5347 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 17:43:09 -0500 Subject: [PATCH 15/24] Refine recipe telemetry semantics and config tracking Keep target telemetry fields only for explicitly configured targets, add separate host fields, remove the ambiguous input model name hash, and add redacted config-overrides plus package-config hash metadata so recipe telemetry can show which overrides users actually provide 
without folding environment-specific package config into recipe_hash. Files changed: - docs/Privacy.md - olive/telemetry/telemetry.py - olive/workflows/run/run.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/Privacy.md | 2 +- olive/telemetry/telemetry.py | 11 +- olive/workflows/run/run.py | 178 ++++++++++++++++++++++++---- test/workflows/test_workflow_run.py | 87 +++++++++++++- 4 files changed, 249 insertions(+), 29 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index 9e1001e72..6ec870678 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -13,4 +13,4 @@ In addition, Olive may collect additional telemetry data such as: - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. 
The `OliveRecipe` event may include recipe metadata such as pass types, explicit target and host settings, whether a custom package config was provided, a hash of that custom package config, and a redacted snapshot of explicitly supplied config overrides. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 68c26bfcc..513fe3c34 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -80,21 +80,26 @@ "recipe_command", "execution_mode", "workflow_id", + "config_overrides", "success", "exception_type", "input_model_type", "input_model_source", - "input_model_name_hash", "model_task", "target_system_type", "target_device", - "execution_provider", - "execution_providers", + "target_execution_provider", + "target_execution_providers", + "host_system_type", + "host_device", + "host_execution_provider", + "host_execution_providers", "pass_types", "pass_count", "data_config_count", "search_enabled", "package_config_provided", + "package_config_hash", "is_ci", "app_version", "app_instance_id", diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index d45d0a329..794c5fa08 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -3,13 +3,15 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import importlib +import json import logging from copy import deepcopy from os import PathLike from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union -from olive.common.utils import hash_dict, hash_string, set_tempdir +from olive.common.config_utils import load_config_file +from olive.common.utils import hash_dict, set_tempdir from olive.hardware.constants import ExecutionProvider from olive.logging import set_default_logger_severity, set_ort_logger_severity, set_verbosity_info from olive.package_config import OlivePackageConfig @@ -25,6 +27,8 @@ logger = logging.getLogger(__name__) RECIPE_HASH_REDACTED_VALUE = "" +CONFIG_REFERENCE_REDACTED_VALUE = "" +CONFIG_CALLABLE_REDACTED_VALUE = "" RECIPE_HASH_REDACTED_KEYS = { "output_dir", "cache_dir", @@ -36,9 +40,18 @@ "prepend_to_path", "script_dir", "model_script", + # package_config is tracked separately via package_config_provided, but + # excluded from recipe_hash because it is an environment/infrastructure path. 
"package_config", "work_dir", } +CONFIG_SNAPSHOT_REDACTED_KEYS = RECIPE_HASH_REDACTED_KEYS | { + "model_path", + "_name_or_path", + "adapter_path", + "user_script", +} +CONFIG_REFERENCE_KEYS = {"host", "target", "evaluator"} def get_required_packages(package_config: OlivePackageConfig, run_config: RunConfig) -> set[str]: @@ -177,6 +190,19 @@ def run( # set tempdir set_tempdir(tempdir) + try: + run_config_telemetry_input = _load_config_input_for_telemetry(run_config) + except Exception: + run_config_telemetry_input = None + + package_config_input = package_config + try: + package_config_telemetry_input = ( + _load_config_input_for_telemetry(package_config_input) if package_config_input is not None else None + ) + except Exception: + package_config_telemetry_input = None + package_config_provided = package_config is not None if package_config is None: package_config = OlivePackageConfig.get_default_config_path() @@ -213,9 +239,11 @@ def run( finally: metadata = _build_recipe_result_metadata( run_config, + run_config_telemetry_input, parsed_run_config, recipe_telemetry_metadata, list_required_packages=list_required_packages, + package_config_input=package_config_telemetry_input, package_config_provided=package_config_provided, ) recipe_name = metadata.pop("recipe_name") @@ -247,10 +275,12 @@ def get_run_on_target(package_config: OlivePackageConfig, pass_config: "RunPassC def _build_recipe_result_metadata( run_config_input: Union[str, Path, dict], + run_config_telemetry_input: Optional[Any], run_config: Optional[RunConfig], recipe_telemetry_metadata: Optional[dict[str, Any]], *, list_required_packages: bool, + package_config_input: Optional[Union[str, Path, dict]], package_config_provided: bool, ) -> dict[str, Any]: metadata = dict(recipe_telemetry_metadata or {}) @@ -259,6 +289,9 @@ def _build_recipe_result_metadata( metadata.setdefault("recipe_format", default_format) metadata.setdefault("execution_mode", "list_required_packages" if list_required_packages else "run") 
metadata.setdefault("package_config_provided", package_config_provided) + metadata.setdefault("config_overrides", _build_config_overrides(run_config_telemetry_input)) + if package_config_provided: + metadata.setdefault("package_config_hash", _build_package_config_hash(package_config_input)) metadata["is_ci"] = is_ci_environment() if run_config is None: @@ -268,6 +301,7 @@ def _build_recipe_result_metadata( run_config_json = run_config.to_json(make_absolute=False) model_metadata = _extract_input_model_metadata(run_config_json["input_model"]) target_metadata = _extract_target_metadata(run_config) + host_metadata = _extract_host_metadata(run_config) pass_types = [pass_config.type for pass_config in get_used_passes_configs(run_config)] metadata.setdefault("recipe_name", metadata.get("recipe_command") or run_config.workflow_id) @@ -275,12 +309,9 @@ def _build_recipe_result_metadata( metadata.setdefault("recipe_hash", _build_recipe_hash(run_config_json)) metadata.setdefault("input_model_type", run_config.input_model.type) metadata.setdefault("input_model_source", model_metadata["input_model_source"]) - metadata.setdefault("input_model_name_hash", model_metadata["input_model_name_hash"]) metadata.setdefault("model_task", model_metadata["model_task"]) - metadata.setdefault("target_system_type", target_metadata["target_system_type"]) - metadata.setdefault("target_device", target_metadata["target_device"]) - metadata.setdefault("execution_provider", target_metadata["execution_provider"]) - metadata.setdefault("execution_providers", target_metadata["execution_providers"]) + _set_metadata_if_present(metadata, target_metadata) + _set_metadata_if_present(metadata, host_metadata) metadata.setdefault("pass_types", ";".join(pass_types)) metadata.setdefault("pass_count", len(pass_types)) metadata.setdefault("data_config_count", len(run_config.data_configs)) @@ -299,6 +330,97 @@ def _classify_run_config_source(run_config_input: Any) -> tuple[str, str]: return "config_object", 
"object" +def _build_config_overrides(config_input: Any) -> Optional[str]: + try: + config_data = _load_config_input_for_telemetry(config_input) + if config_data is None: + return None + + snapshot = _sanitize_config_snapshot(config_data) + if snapshot in (None, {}, []): + return None + + return json.dumps(snapshot, ensure_ascii=False, sort_keys=True, separators=(",", ":")) + except Exception: + return None + + +def _build_package_config_hash(config_input: Any) -> Optional[str]: + try: + config_data = _load_config_input_for_telemetry(config_input) + if not isinstance(config_data, dict): + return None + + snapshot = _sanitize_config_snapshot(config_data) + if not isinstance(snapshot, dict): + return None + + return hash_dict(snapshot)[:16] + except Exception: + return None + + +def _load_config_input_for_telemetry(config_input: Any) -> Optional[Any]: + if config_input is None: + return None + if isinstance(config_input, dict): + return deepcopy(config_input) + if isinstance(config_input, (str, PathLike)): + return load_config_file(config_input) + + model_dump = getattr(config_input, "model_dump", None) + if callable(model_dump): + return model_dump(exclude_defaults=True, exclude_none=True, by_alias=True) + return None + + +def _sanitize_config_snapshot(value: Any, key: Optional[str] = None) -> Any: + if key in CONFIG_SNAPSHOT_REDACTED_KEYS or _is_path_like_key(key): + return RECIPE_HASH_REDACTED_VALUE + if key in CONFIG_REFERENCE_KEYS and isinstance(value, str): + return CONFIG_REFERENCE_REDACTED_VALUE + + if isinstance(value, dict): + if key == "systems": + return [_sanitize_config_snapshot(system, "system") for system in value.values()] + if key == "passes": + passes = [] + for pass_configs in value.values(): + if isinstance(pass_configs, list): + passes.extend(pass_configs) + else: + passes.append(pass_configs) + return [_sanitize_config_snapshot(pass_config, "pass") for pass_config in passes] + if key == "evaluators": + return 
[_sanitize_config_snapshot(evaluator, "evaluator_config") for evaluator in value.values()] + return { + child_key: _sanitize_config_snapshot(child_value, child_key) + for child_key, child_value in value.items() + if child_value is not None + } + if isinstance(value, list): + return [_sanitize_config_snapshot(item, key) for item in value] + if isinstance(value, tuple): + return [_sanitize_config_snapshot(item, key) for item in value] + if isinstance(value, Path): + return RECIPE_HASH_REDACTED_VALUE + if callable(value): + return CONFIG_CALLABLE_REDACTED_VALUE + if isinstance(value, (str, int, float, bool)) or value is None: + return value + if hasattr(value, "value") and isinstance(value.value, (str, int, float, bool)): + return value.value + return f"<{type(value).__name__}>" + + +def _is_path_like_key(key: Optional[str]) -> bool: + if key is None: + return False + return key in {"path", "paths", "dir", "dirs", "file", "files"} or key.endswith( + ("_path", "_paths", "_dir", "_dirs", "_file", "_files") + ) + + def _extract_input_model_metadata(input_model_config: dict[str, Any]) -> dict[str, Optional[str]]: model_config = input_model_config.get("config", {}) model_attributes = model_config.get("model_attributes", {}) @@ -306,7 +428,6 @@ def _extract_input_model_metadata(input_model_config: dict[str, Any]) -> dict[st raw_identifier = model_attributes.get("_name_or_path") or model_config.get("model_path") return { "input_model_source": _classify_input_model_source(raw_identifier), - "input_model_name_hash": _hash_value(raw_identifier), "model_task": str(model_task) if model_task is not None else None, } @@ -335,29 +456,48 @@ def _classify_input_model_source(model_identifier: Any) -> str: def _extract_target_metadata(run_config: RunConfig) -> dict[str, Optional[str]]: - target_system = run_config.engine.target or run_config.engine.host - target_system_type = target_system.type.value if target_system is not None else None - target_device = None + target_system = 
run_config.engine.target + return _extract_system_metadata(target_system, "target") + + +def _extract_host_metadata(run_config: RunConfig) -> dict[str, Optional[str]]: + host_system = run_config.engine.host + if host_system is None: + return { + "host_system_type": SystemType.Local.value, + } + return _extract_system_metadata(host_system, "host") + + +def _extract_system_metadata(system_config: Optional[Any], field_prefix: str) -> dict[str, Optional[str]]: + system_type = system_config.type.value if system_config is not None else None + device = None execution_provider = None execution_providers = None - accelerators = target_system.config.accelerators if target_system and target_system.config else None + accelerators = system_config.config.accelerators if system_config and system_config.config else None if accelerators: accelerator = accelerators[0] - target_device = str(accelerator.device) if accelerator.device is not None else None + device = str(accelerator.device) if accelerator.device is not None else None ep_values = accelerator.get_ep_strs() or [] if ep_values: execution_provider = ep_values[0] execution_providers = ";".join(ep_values) return { - "target_system_type": target_system_type, - "target_device": target_device, - "execution_provider": execution_provider, - "execution_providers": execution_providers, + f"{field_prefix}_system_type": system_type, + f"{field_prefix}_device": device, + f"{field_prefix}_execution_provider": execution_provider, + f"{field_prefix}_execution_providers": execution_providers, } +def _set_metadata_if_present(metadata: dict[str, Any], values: dict[str, Optional[str]]) -> None: + for key, value in values.items(): + if value is not None: + metadata.setdefault(key, value) + + def _build_recipe_hash(run_config_json: dict[str, Any]) -> str: sanitized = deepcopy(run_config_json) _redact_recipe_hash_keys(sanitized) @@ -383,9 +523,3 @@ def _set_path_value(container: Any, path: tuple[Any, ...], value: Any) -> None: for key in 
path[:-1]: current = current[key] current[path[-1]] = value - - -def _hash_value(value: Any) -> Optional[str]: - if value is None: - return None - return hash_string(str(value))[:16] diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index 3d7a4dbc5..b0382b935 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -1,3 +1,4 @@ +import json import sys from copy import deepcopy from pathlib import Path @@ -175,8 +176,12 @@ def test_run_logs_recipe_result_success(mock_run_engine, mock_log_recipe_result) assert metadata["model_task"] == "text-generation" assert metadata["target_system_type"] == "LocalSystem" assert metadata["target_device"] == "gpu" - assert metadata["execution_provider"] == "CUDAExecutionProvider" - assert metadata["execution_providers"] == "CUDAExecutionProvider" + assert metadata["target_execution_provider"] == "CUDAExecutionProvider" + assert metadata["target_execution_providers"] == "CUDAExecutionProvider" + assert metadata["host_system_type"] == "LocalSystem" + assert "host_device" not in metadata + assert "host_execution_provider" not in metadata + assert "host_execution_providers" not in metadata assert metadata["pass_types"] == "onnxdynamicquantization" assert metadata["pass_count"] == 1 assert metadata["data_config_count"] == 0 @@ -184,7 +189,13 @@ def test_run_logs_recipe_result_success(mock_run_engine, mock_log_recipe_result) assert metadata["package_config_provided"] is False assert metadata["is_ci"] is False assert metadata["recipe_hash"] - assert metadata["input_model_name_hash"] + assert "input_model_name_hash" not in metadata + + config_overrides = json.loads(metadata["config_overrides"]) + assert config_overrides["input_model"]["model_path"] == "" + assert config_overrides["engine"]["target"] == "" + assert config_overrides["systems"][0]["type"] == "LocalSystem" + assert config_overrides["systems"][0]["accelerators"][0]["execution_providers"] == 
["CUDAExecutionProvider"] @patch("olive.workflows.run.run.log_recipe_result") @@ -218,5 +229,75 @@ def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result) assert mock_log_recipe_result.call_args.kwargs["exception_type"] == "ValueError" +@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.workflows.run.run.run_engine") +def test_run_logs_recipe_host_metadata_without_explicit_target(mock_run_engine, mock_log_recipe_result): + config = { + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-0.5B-Instruct", + "task": "text-generation", + "load_kwargs": {"attn_implementation": "eager"}, + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}], + } + }, + "engine": {"host": "host_system"}, + } + mock_run_engine.return_value = object() + + olive_run( + config, + recipe_telemetry_metadata={ + "recipe_name": "Quantize", + "recipe_command": "Quantize", + "recipe_source": "generated_cli", + "recipe_format": "generated", + }, + ) + + metadata = mock_log_recipe_result.call_args.kwargs["metadata"] + assert "target_system_type" not in metadata + assert "target_device" not in metadata + assert "target_execution_provider" not in metadata + assert "target_execution_providers" not in metadata + assert metadata["host_system_type"] == "LocalSystem" + assert metadata["host_device"] == "cpu" + assert metadata["host_execution_provider"] == "CPUExecutionProvider" + assert metadata["host_execution_providers"] == "CPUExecutionProvider" + + +@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.workflows.run.run.run_engine") +def test_run_logs_package_config_hash_when_package_config_provided(mock_run_engine, mock_log_recipe_result): + config = { + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-0.5B-Instruct", + "task": "text-generation", + "load_kwargs": {"attn_implementation": "eager"}, + } + } + 
mock_run_engine.return_value = object() + + olive_run( + config, + package_config={}, + recipe_telemetry_metadata={ + "recipe_name": "Quantize", + "recipe_command": "Quantize", + "recipe_source": "generated_cli", + "recipe_format": "generated", + }, + ) + + metadata = mock_log_recipe_result.call_args.kwargs["metadata"] + assert metadata["package_config_provided"] is True + assert metadata["package_config_hash"] + + def test_classify_run_config_source_handles_non_pathlike_object(): assert _classify_run_config_source(object()) == ("config_object", "object") From 89876690ca601a84a19d6dbad040d87b01db1cb0 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 18:56:22 -0500 Subject: [PATCH 16/24] Replace package config hash with override values Log redacted package-config overrides instead of an opaque hash so Olive telemetry captures the specific package settings users changed, while still excluding package_config from recipe_hash and avoiding raw module-path leakage. Files changed: - docs/Privacy.md - olive/telemetry/telemetry.py - olive/workflows/run/run.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/Privacy.md | 2 +- olive/telemetry/telemetry.py | 2 +- olive/workflows/run/run.py | 73 ++++++++++++++++++++++++++--- test/workflows/test_workflow_run.py | 17 +++++-- 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index 6ec870678..239b30e41 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -13,4 +13,4 @@ In addition, Olive may collect additional telemetry data such as: - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. 
In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. The `OliveRecipe` event may include recipe metadata such as pass types, explicit target and host settings, whether a custom package config was provided, a hash of that custom package config, and a redacted snapshot of explicitly supplied config overrides. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. The `OliveRecipe` event may include recipe metadata such as pass types, explicit target and host settings, whether a custom package config was provided, a redacted snapshot of custom package-config overrides, and a redacted snapshot of explicitly supplied config overrides. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. 
diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 513fe3c34..c15614a5f 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -99,7 +99,7 @@ "data_config_count", "search_enabled", "package_config_provided", - "package_config_hash", + "package_config_overrides", "is_ci", "app_version", "app_instance_id", diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 794c5fa08..fca19e681 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import functools import importlib import json import logging @@ -40,8 +41,9 @@ "prepend_to_path", "script_dir", "model_script", - # package_config is tracked separately via package_config_provided, but - # excluded from recipe_hash because it is an environment/infrastructure path. + # package_config is tracked separately via package_config_provided and + # package_config_overrides, but excluded from recipe_hash because it is an + # environment/infrastructure path. 
"package_config", "work_dir", } @@ -52,6 +54,7 @@ "user_script", } CONFIG_REFERENCE_KEYS = {"host", "target", "evaluator"} +_NO_OVERRIDE = object() def get_required_packages(package_config: OlivePackageConfig, run_config: RunConfig) -> set[str]: @@ -291,7 +294,7 @@ def _build_recipe_result_metadata( metadata.setdefault("package_config_provided", package_config_provided) metadata.setdefault("config_overrides", _build_config_overrides(run_config_telemetry_input)) if package_config_provided: - metadata.setdefault("package_config_hash", _build_package_config_hash(package_config_input)) + metadata.setdefault("package_config_overrides", _build_package_config_overrides(package_config_input)) metadata["is_ci"] = is_ci_environment() if run_config is None: @@ -345,21 +348,79 @@ def _build_config_overrides(config_input: Any) -> Optional[str]: return None -def _build_package_config_hash(config_input: Any) -> Optional[str]: +def _build_package_config_overrides(config_input: Any) -> Optional[str]: try: config_data = _load_config_input_for_telemetry(config_input) if not isinstance(config_data, dict): return None - snapshot = _sanitize_config_snapshot(config_data) + default_config = _load_default_package_config_for_telemetry() + baseline = ( + _normalize_package_config_snapshot(default_config) if isinstance(default_config, dict) else _NO_OVERRIDE + ) + overrides = _extract_config_overrides(_normalize_package_config_snapshot(config_data), baseline) + if overrides is _NO_OVERRIDE: + return None + + snapshot = _sanitize_config_snapshot(overrides) if not isinstance(snapshot, dict): return None - return hash_dict(snapshot)[:16] + return json.dumps(snapshot, ensure_ascii=False, sort_keys=True, separators=(",", ":")) except Exception: return None +@functools.lru_cache +def _load_default_package_config_for_telemetry() -> Optional[dict[str, Any]]: + try: + default_config = load_config_file(OlivePackageConfig.get_default_config_path()) + except Exception: + return None + + return 
default_config if isinstance(default_config, dict) else None + + +def _normalize_package_config_snapshot(config_data: Any) -> Any: + if not isinstance(config_data, dict): + return config_data + + normalized = deepcopy(config_data) + passes = normalized.get("passes") + if isinstance(passes, dict): + normalized["passes"] = {str(pass_name).lower(): pass_config for pass_name, pass_config in passes.items()} + return normalized + + +def _extract_config_overrides(value: Any, baseline: Any = _NO_OVERRIDE) -> Any: + if baseline is _NO_OVERRIDE: + return deepcopy(value) + + if isinstance(value, dict) and isinstance(baseline, dict): + overrides = {} + for key, child_value in value.items(): + child_override = _extract_config_overrides(child_value, baseline.get(key, _NO_OVERRIDE)) + if child_override is not _NO_OVERRIDE: + overrides[key] = child_override + if overrides: + return overrides + return _NO_OVERRIDE if value == baseline else {} + + if isinstance(value, list): + if isinstance(baseline, list) and value == baseline: + return _NO_OVERRIDE + return deepcopy(value) + + if isinstance(value, tuple): + value_list = list(value) + baseline_list = list(baseline) if isinstance(baseline, tuple) else baseline + if isinstance(baseline_list, list) and value_list == baseline_list: + return _NO_OVERRIDE + return value_list + + return deepcopy(value) if value != baseline else _NO_OVERRIDE + + def _load_config_input_for_telemetry(config_input: Any) -> Optional[Any]: if config_input is None: return None diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index b0382b935..270a56d19 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -272,7 +272,7 @@ def test_run_logs_recipe_host_metadata_without_explicit_target(mock_run_engine, @patch("olive.workflows.run.run.log_recipe_result") @patch("olive.workflows.run.run.run_engine") -def test_run_logs_package_config_hash_when_package_config_provided(mock_run_engine, 
mock_log_recipe_result): +def test_run_logs_package_config_overrides_when_package_config_provided(mock_run_engine, mock_log_recipe_result): config = { "input_model": { "type": "HfModel", @@ -285,7 +285,15 @@ def test_run_logs_package_config_hash_when_package_config_provided(mock_run_engi olive_run( config, - package_config={}, + package_config={ + "passes": { + "AddOliveMetadata": { + "module_path": "olive.passes.onnx.add_metadata.AddOliveMetadata", + "supported_providers": ["CPUExecutionProvider"], + } + }, + "extra_dependencies": {"custom_accelerator": ["custom-package"]}, + }, recipe_telemetry_metadata={ "recipe_name": "Quantize", "recipe_command": "Quantize", @@ -296,7 +304,10 @@ def test_run_logs_package_config_hash_when_package_config_provided(mock_run_engi metadata = mock_log_recipe_result.call_args.kwargs["metadata"] assert metadata["package_config_provided"] is True - assert metadata["package_config_hash"] + package_config_overrides = json.loads(metadata["package_config_overrides"]) + assert package_config_overrides["passes"][0]["supported_providers"] == ["CPUExecutionProvider"] + assert "module_path" not in package_config_overrides["passes"][0] + assert package_config_overrides["extra_dependencies"]["custom_accelerator"] == ["custom-package"] def test_classify_run_config_source_handles_non_pathlike_object(): From ff60650862c8dd020452a3cba4017a75a03403b6 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 19:19:51 -0500 Subject: [PATCH 17/24] Guard Azure CI secret-dependent login steps Skip Hugging Face and Docker logins when PR secrets are unavailable or unresolved so Azure unit-test jobs do not fail before tests start on fork-style runs, while still preserving normal login behavior when valid credentials are present. 
Files changed: - .azure_pipelines/job_templates/build-docker-image-template.yaml - .azure_pipelines/job_templates/huggingface-login-template.yaml - .azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml - .azure_pipelines/scripts/run_test.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../build-docker-image-template.yaml | 26 +++++++++++++------ .../huggingface-login-template.yaml | 10 ++++++- .../olive-test-linux-gpu-template.yaml | 11 +++++++- .azure_pipelines/scripts/run_test.sh | 10 ++++++- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/.azure_pipelines/job_templates/build-docker-image-template.yaml b/.azure_pipelines/job_templates/build-docker-image-template.yaml index 9d671c132..90ec0726e 100644 --- a/.azure_pipelines/job_templates/build-docker-image-template.yaml +++ b/.azure_pipelines/job_templates/build-docker-image-template.yaml @@ -8,15 +8,25 @@ parameters: trt_version: '' steps: -- script: | - docker login -u $(docker-username) -p $(docker-password) - docker build \ - --build-arg BASE_IMAGE=${{ parameters.base_image }} \ - --build-arg TENSORRT_VERSION=${{ parameters.trt_version }} \ - --build-arg PYTHON_VERSION=${{ parameters.python_version }} \ - -t ${{ parameters.docker_image }} \ - -f $(Build.SourcesDirectory)/${{ parameters.dockerfile }} . +- pwsh: | + $username = $env:DOCKER_USERNAME + $password = $env:DOCKER_PASSWORD + if ( + [string]::IsNullOrWhiteSpace($username) -or + [string]::IsNullOrWhiteSpace($password) -or + $username -match '^\$\([^)]+\)$' -or + $password -match '^\$\([^)]+\)$' + ) { + Write-Host "Skipping docker login because registry credentials are unavailable." 
+ } else { + $password | docker login -u $username --password-stdin + } + + docker build --build-arg BASE_IMAGE=${{ parameters.base_image }} --build-arg TENSORRT_VERSION=${{ parameters.trt_version }} --build-arg PYTHON_VERSION=${{ parameters.python_version }} -t ${{ parameters.docker_image }} -f "$(Build.SourcesDirectory)/${{ parameters.dockerfile }}" . displayName: Build Docker Image + env: + DOCKER_USERNAME: $(docker-username) + DOCKER_PASSWORD: $(docker-password) - script: | docker version diff --git a/.azure_pipelines/job_templates/huggingface-login-template.yaml b/.azure_pipelines/job_templates/huggingface-login-template.yaml index 59ae97b75..10645dbd7 100644 --- a/.azure_pipelines/job_templates/huggingface-login-template.yaml +++ b/.azure_pipelines/job_templates/huggingface-login-template.yaml @@ -2,5 +2,13 @@ parameters: hf_token: 'huggingface_token' steps: -- script: hf auth login --token ${{ parameters.hf_token }} +- pwsh: | + $token = $env:HF_TOKEN + if ([string]::IsNullOrWhiteSpace($token) -or $token -match '^\$\([^)]+\)$') { + Write-Host "Skipping Hugging Face login because no token is available." 
+ exit 0 + } + hf auth login --token "$token" displayName: 'Hugging Face Login' + env: + HF_TOKEN: ${{ parameters.hf_token }} diff --git a/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml b/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml index 10cee435c..5ed90dcfb 100644 --- a/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml +++ b/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml @@ -59,6 +59,13 @@ jobs: trt_version: ${{ parameters.trt_version }} - script: | + hf_token_input="${HF_TOKEN:-}" + case "$hf_token_input" in + '$('*')') + hf_token_input="" + ;; + esac + docker run \ --shm-size=4g \ --gpus=all \ @@ -72,9 +79,11 @@ jobs: test/${{ parameters.requirements_file }} \ ${{ parameters.test_path }} \ false \ - $(hf_token) \ + "$hf_token_input" \ "${{ parameters.pytest_marker }}" displayName: Run Tests in Docker + env: + HF_TOKEN: $(hf_token) - task: CredScan@3 displayName: 'Run CredScan' diff --git a/.azure_pipelines/scripts/run_test.sh b/.azure_pipelines/scripts/run_test.sh index caa5fcb1c..b81dcb0c0 100644 --- a/.azure_pipelines/scripts/run_test.sh +++ b/.azure_pipelines/scripts/run_test.sh @@ -40,7 +40,15 @@ BUILD_CUDA_EXT=0 pip install --no-build-isolation "git+https://github.com/PanQiW # Set HF Token pip install huggingface-hub -hf auth login --token "$7" +hf_token="$7" +case "$hf_token" in +"" | '$('*')') + echo "Skipping Hugging Face login because no token is available." + ;; +*) + hf auth login --token "$hf_token" + ;; +esac pip list From 1cae73fffb294e0e7dc5780c80c70d35b21f6bb7 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 20:06:54 -0500 Subject: [PATCH 18/24] Revert Azure CI secret login guards Revert the Azure pipeline login guard changes because the pipeline behavior should match main and those changes are not necessary for the telemetry PR. 
Files changed: - .azure_pipelines/job_templates/build-docker-image-template.yaml - .azure_pipelines/job_templates/huggingface-login-template.yaml - .azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml - .azure_pipelines/scripts/run_test.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../build-docker-image-template.yaml | 26 ++++++------------- .../huggingface-login-template.yaml | 10 +------ .../olive-test-linux-gpu-template.yaml | 11 +------- .azure_pipelines/scripts/run_test.sh | 10 +------ 4 files changed, 11 insertions(+), 46 deletions(-) diff --git a/.azure_pipelines/job_templates/build-docker-image-template.yaml b/.azure_pipelines/job_templates/build-docker-image-template.yaml index 90ec0726e..9d671c132 100644 --- a/.azure_pipelines/job_templates/build-docker-image-template.yaml +++ b/.azure_pipelines/job_templates/build-docker-image-template.yaml @@ -8,25 +8,15 @@ parameters: trt_version: '' steps: -- pwsh: | - $username = $env:DOCKER_USERNAME - $password = $env:DOCKER_PASSWORD - if ( - [string]::IsNullOrWhiteSpace($username) -or - [string]::IsNullOrWhiteSpace($password) -or - $username -match '^\$\([^)]+\)$' -or - $password -match '^\$\([^)]+\)$' - ) { - Write-Host "Skipping docker login because registry credentials are unavailable." - } else { - $password | docker login -u $username --password-stdin - } - - docker build --build-arg BASE_IMAGE=${{ parameters.base_image }} --build-arg TENSORRT_VERSION=${{ parameters.trt_version }} --build-arg PYTHON_VERSION=${{ parameters.python_version }} -t ${{ parameters.docker_image }} -f "$(Build.SourcesDirectory)/${{ parameters.dockerfile }}" . 
+- script: | + docker login -u $(docker-username) -p $(docker-password) + docker build \ + --build-arg BASE_IMAGE=${{ parameters.base_image }} \ + --build-arg TENSORRT_VERSION=${{ parameters.trt_version }} \ + --build-arg PYTHON_VERSION=${{ parameters.python_version }} \ + -t ${{ parameters.docker_image }} \ + -f $(Build.SourcesDirectory)/${{ parameters.dockerfile }} . displayName: Build Docker Image - env: - DOCKER_USERNAME: $(docker-username) - DOCKER_PASSWORD: $(docker-password) - script: | docker version diff --git a/.azure_pipelines/job_templates/huggingface-login-template.yaml b/.azure_pipelines/job_templates/huggingface-login-template.yaml index 10645dbd7..59ae97b75 100644 --- a/.azure_pipelines/job_templates/huggingface-login-template.yaml +++ b/.azure_pipelines/job_templates/huggingface-login-template.yaml @@ -2,13 +2,5 @@ parameters: hf_token: 'huggingface_token' steps: -- pwsh: | - $token = $env:HF_TOKEN - if ([string]::IsNullOrWhiteSpace($token) -or $token -match '^\$\([^)]+\)$') { - Write-Host "Skipping Hugging Face login because no token is available." 
- exit 0 - } - hf auth login --token "$token" +- script: hf auth login --token ${{ parameters.hf_token }} displayName: 'Hugging Face Login' - env: - HF_TOKEN: ${{ parameters.hf_token }} diff --git a/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml b/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml index 5ed90dcfb..10cee435c 100644 --- a/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml +++ b/.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml @@ -59,13 +59,6 @@ jobs: trt_version: ${{ parameters.trt_version }} - script: | - hf_token_input="${HF_TOKEN:-}" - case "$hf_token_input" in - '$('*')') - hf_token_input="" - ;; - esac - docker run \ --shm-size=4g \ --gpus=all \ @@ -79,11 +72,9 @@ jobs: test/${{ parameters.requirements_file }} \ ${{ parameters.test_path }} \ false \ - "$hf_token_input" \ + $(hf_token) \ "${{ parameters.pytest_marker }}" displayName: Run Tests in Docker - env: - HF_TOKEN: $(hf_token) - task: CredScan@3 displayName: 'Run CredScan' diff --git a/.azure_pipelines/scripts/run_test.sh b/.azure_pipelines/scripts/run_test.sh index b81dcb0c0..caa5fcb1c 100644 --- a/.azure_pipelines/scripts/run_test.sh +++ b/.azure_pipelines/scripts/run_test.sh @@ -40,15 +40,7 @@ BUILD_CUDA_EXT=0 pip install --no-build-isolation "git+https://github.com/PanQiW # Set HF Token pip install huggingface-hub -hf_token="$7" -case "$hf_token" in -"" | '$('*')') - echo "Skipping Hugging Face login because no token is available." - ;; -*) - hf auth login --token "$hf_token" - ;; -esac +hf auth login --token "$7" pip list From 14130d4508c74b88bf332496ffe17b23aa77473e Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 4 May 2026 22:54:39 -0500 Subject: [PATCH 19/24] Address telemetry review comments Avoid duplicate OliveRecipe telemetry from Docker workflows by suppressing inner workflow recipe events and forwarding CI detection into workflow containers. 
Keep CI telemetry ephemeral by skipping cache setup, and make recipe metadata stable by avoiding filesystem-sensitive model/resource classification. Files changed: - docs/Privacy.md - olive/systems/docker/docker_system.py - olive/telemetry/constants.py - olive/telemetry/telemetry.py - olive/workflows/run/run.py - test/systems/docker/test_docker_system.py - test/test_telemetry.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/Privacy.md | 2 +- olive/systems/docker/docker_system.py | 8 ++- olive/telemetry/constants.py | 3 +- olive/telemetry/telemetry.py | 6 +- olive/workflows/run/run.py | 55 +++++++------- test/systems/docker/test_docker_system.py | 21 ++++++ test/test_telemetry.py | 2 + test/workflows/test_workflow_run.py | 88 ++++++++++++++++++++++- 8 files changed, 149 insertions(+), 36 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index 239b30e41..17127ba99 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -13,4 +13,4 @@ In addition, Olive may collect additional telemetry data such as: - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. The `OliveRecipe` event may include recipe metadata such as pass types, explicit target and host settings, whether a custom package config was provided, a redacted snapshot of custom package-config overrides, and a redacted snapshot of explicitly supplied config overrides. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. 
You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. In CI/CD environments (e.g., GitHub Actions, Azure Pipelines, Jenkins), Olive suppresses the general heartbeat/action/error events and only emits the `OliveRecipe` event. The `OliveRecipe` event may include recipe metadata such as pass types, explicitly configured target settings, the host system type and any explicitly configured host accelerator settings, whether a custom package config was provided, a redacted snapshot of custom package-config overrides, and a redacted snapshot of explicitly supplied config overrides. Outside CI/CD environments, if telemetry is enabled but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. You can override the default cache location by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path. 
diff --git a/olive/systems/docker/docker_system.py b/olive/systems/docker/docker_system.py index 8371cccd4..07e388cd6 100644 --- a/olive/systems/docker/docker_system.py +++ b/olive/systems/docker/docker_system.py @@ -5,6 +5,7 @@ import copy import json import logging +import os import sys import tempfile from pathlib import Path @@ -19,6 +20,8 @@ from olive.systems.common import AcceleratorConfig, SystemType from olive.systems.olive_system import OliveSystem from olive.systems.system_config import LocalTargetUserConfig, SystemConfig +from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV +from olive.telemetry.telemetry import is_ci_environment from olive.workflows.run.config import RunConfig if TYPE_CHECKING: @@ -241,6 +244,9 @@ def _prepare_environment(self, base_env) -> dict: # Add default environment variables environment.setdefault("PYTHONPYCACHEPREFIX", "/tmp") environment["OLIVE_LOG_LEVEL"] = logging.getLevelName(logger.getEffectiveLevel()) + environment[SUPPRESS_WORKFLOW_TELEMETRY_ENV] = "1" + if is_ci_environment(): + environment["CI"] = "1" # Add HuggingFace token if needed if self.hf_token: @@ -303,8 +309,6 @@ def _create_runner_script_mount(self) -> tuple[str, str]: @staticmethod def _get_huggingface_token() -> Optional[str]: """Get HuggingFace token from environment or file.""" - import os - # Check environment variable token = os.getenv("HF_TOKEN") if token: diff --git a/olive/telemetry/constants.py b/olive/telemetry/constants.py index 535929866..420da88b3 100644 --- a/olive/telemetry/constants.py +++ b/olive/telemetry/constants.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -"""OneCollector connection string.""" +"""Telemetry constants.""" CONNECTION_STRING = "SW5zdHJ1bWVudGF0aW9uS2V5PTYyMTUwOTExZGMwMDRmYzliYjY3YmE5NjA2NDI3ZTU2LWVjNjFmOWFmLTVkN2EtNGQxOS1hZjMxLWI5Y2Q2OWU5ODdmMS02OTE1" +SUPPRESS_WORKFLOW_TELEMETRY_ENV = "OLIVE_SUPPRESS_WORKFLOW_TELEMETRY" diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index c15614a5f..274e86c00 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -473,11 +473,11 @@ def __init__(self): self._logger = self._create_logger() event_source.disable() - self._cache_handler = TelemetryCacheHandler(self) - self._setup_payload_callbacks() is_ci = self._is_ci_environment() self._recipe_only_ci_telemetry = is_ci if not is_ci: + self._cache_handler = TelemetryCacheHandler(self) + self._setup_payload_callbacks() self._log_heartbeat() if os.environ.get("OLIVE_DISABLE_TELEMETRY") == "1": self.disable_telemetry() @@ -500,7 +500,7 @@ def _create_logger(self) -> Optional[TelemetryLogger]: def _setup_payload_callbacks(self) -> None: # Register callback for payload transmission events # No need to store unregister function - logger shutdown will clean up callbacks - if self._logger is None: + if self._logger is None or self._cache_handler is None: return self._logger.register_payload_transmitted_callback( self._cache_handler.on_payload_transmitted, diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index fca19e681..62c1ca26f 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -6,9 +6,10 @@ import importlib import json import logging +import os from copy import deepcopy from os import PathLike -from pathlib import Path +from pathlib import Path, PurePosixPath, PureWindowsPath from typing import TYPE_CHECKING, Any, Optional, Union from olive.common.config_utils import load_config_file @@ -16,9 +17,9 @@ from olive.hardware.constants import ExecutionProvider 
from olive.logging import set_default_logger_severity, set_ort_logger_severity, set_verbosity_info from olive.package_config import OlivePackageConfig -from olive.resource_path import create_resource_path, find_all_resources from olive.systems.accelerator_creator import create_accelerator from olive.systems.common import SystemType +from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.telemetry.telemetry import is_ci_environment from olive.telemetry.telemetry_extensions import log_recipe_result from olive.workflows.run.config import RunConfig @@ -227,7 +228,7 @@ def run( if parsed_run_config.engine.host and parsed_run_config.engine.host.type == SystemType.Docker: docker_system = parsed_run_config.engine.host.create_system() - workflow_output = docker_system.run_workflow(parsed_run_config) + workflow_output = docker_system.run_workflow(deepcopy(parsed_run_config)) success = True return workflow_output @@ -240,17 +241,18 @@ def run( exception_type = type(exc).__name__ raise finally: - metadata = _build_recipe_result_metadata( - run_config, - run_config_telemetry_input, - parsed_run_config, - recipe_telemetry_metadata, - list_required_packages=list_required_packages, - package_config_input=package_config_telemetry_input, - package_config_provided=package_config_provided, - ) - recipe_name = metadata.pop("recipe_name") - log_recipe_result(recipe_name, success=success, metadata=metadata, exception_type=exception_type) + if os.environ.get(SUPPRESS_WORKFLOW_TELEMETRY_ENV) != "1": + metadata = _build_recipe_result_metadata( + run_config, + run_config_telemetry_input, + parsed_run_config, + recipe_telemetry_metadata, + list_required_packages=list_required_packages, + package_config_input=package_config_telemetry_input, + package_config_provided=package_config_provided, + ) + recipe_name = metadata.pop("recipe_name") + log_recipe_result(recipe_name, success=success, metadata=metadata, exception_type=exception_type) def 
generate_files_from_packages(packages, file_name): @@ -510,12 +512,20 @@ def _classify_input_model_source(model_identifier: Any) -> str: if identifier.startswith(("http://", "https://")): return "url" - resource_path = create_resource_path(identifier) - if resource_path.is_local_resource(): - return "local_file" if resource_path.type.value == "file" else "local_folder" + if _is_explicit_local_model_path(identifier): + suffix = PureWindowsPath(identifier).suffix or PurePosixPath(identifier).suffix + return "local_file" if suffix else "local_folder" return "string_name" +def _is_explicit_local_model_path(identifier: str) -> bool: + return ( + identifier.startswith(("./", "../", ".\\", "..\\", "~/", "~\\", "/", "\\\\")) + or PureWindowsPath(identifier).is_absolute() + or PurePosixPath(identifier).is_absolute() + ) + + def _extract_target_metadata(run_config: RunConfig) -> dict[str, Optional[str]]: target_system = run_config.engine.target return _extract_system_metadata(target_system, "target") @@ -562,13 +572,11 @@ def _set_metadata_if_present(metadata: dict[str, Any], values: dict[str, Optiona def _build_recipe_hash(run_config_json: dict[str, Any]) -> str: sanitized = deepcopy(run_config_json) _redact_recipe_hash_keys(sanitized) - for path in find_all_resources(sanitized): - _set_path_value(sanitized, path, RECIPE_HASH_REDACTED_VALUE) return hash_dict(sanitized)[:16] def _redact_recipe_hash_keys(value: Any, key: Optional[str] = None) -> Any: - if key in RECIPE_HASH_REDACTED_KEYS: + if key in RECIPE_HASH_REDACTED_KEYS or _is_path_like_key(key): return RECIPE_HASH_REDACTED_VALUE if isinstance(value, dict): for child_key in list(value): @@ -577,10 +585,3 @@ def _redact_recipe_hash_keys(value: Any, key: Optional[str] = None) -> Any: for index, item in enumerate(value): value[index] = _redact_recipe_hash_keys(item, key) return value - - -def _set_path_value(container: Any, path: tuple[Any, ...], value: Any) -> None: - current = container - for key in path[:-1]: - current 
= current[key] - current[path[-1]] = value diff --git a/test/systems/docker/test_docker_system.py b/test/systems/docker/test_docker_system.py index 5430b6858..3d668c6b6 100644 --- a/test/systems/docker/test_docker_system.py +++ b/test/systems/docker/test_docker_system.py @@ -8,6 +8,7 @@ from olive.systems.docker.docker_system import DockerSystem from olive.systems.system_config import DockerTargetUserConfig +from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from test.utils import ONNX_MODEL_PATH # pylint: disable=attribute-defined-outside-init,protected-access @@ -136,10 +137,30 @@ def test_run_workflow(self, mock_find_resources, mock_tempdir, mock_from_env, tm command = mock_docker_client.containers.run.call_args[1]["command"] assert "workflow_runner.py" in command assert "--config" in command + assert mock_docker_client.containers.run.call_args.kwargs["environment"][SUPPRESS_WORKFLOW_TELEMETRY_ENV] == "1" # Verify cleanup mock_container.remove.assert_called_once() + @patch("olive.systems.docker.docker_system.docker.from_env") + def test_prepare_environment_forwards_ci_to_workflow_container(self, mock_from_env, monkeypatch): + mock_docker_client = MagicMock() + mock_from_env.return_value = mock_docker_client + mock_docker_client.images.get.return_value = MagicMock() + monkeypatch.setenv("TF_BUILD", "True") + docker_config = self.get_default_docker_config() + docker_system = DockerSystem( + image_name=docker_config.image_name, + build_context_path=docker_config.build_context_path, + dockerfile=docker_config.dockerfile, + work_dir=docker_config.work_dir, + ) + + environment = docker_system._prepare_environment({}) + + assert environment["CI"] == "1" + assert environment[SUPPRESS_WORKFLOW_TELEMETRY_ENV] == "1" + @patch("olive.systems.docker.docker_system.docker.from_env") @patch("olive.systems.docker.docker_system.tempfile.TemporaryDirectory") @patch("olive.systems.docker.docker_system.find_all_resources") diff --git a/test/test_telemetry.py 
b/test/test_telemetry.py index 426ef968c..bb394d6cb 100644 --- a/test/test_telemetry.py +++ b/test/test_telemetry.py @@ -55,6 +55,8 @@ def test_telemetry_only_logs_recipe_events_in_ci(monkeypatch): assert mock_logger.log.call_count == 1 assert mock_logger.log.call_args.args[0] == RECIPE_EVENT_NAME + assert telemetry._cache_handler is None + mock_logger.register_payload_transmitted_callback.assert_not_called() finally: Telemetry._instance = None diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index 270a56d19..d3e3961f7 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -2,12 +2,13 @@ import sys from copy import deepcopy from pathlib import Path -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest +from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.workflows import run as olive_run -from olive.workflows.run.run import _classify_run_config_source +from olive.workflows.run.run import _build_recipe_hash, _classify_input_model_source, _classify_run_config_source from test.utils import ( get_pytorch_model, get_pytorch_model_config, @@ -229,6 +230,66 @@ def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result) assert mock_log_recipe_result.call_args.kwargs["exception_type"] == "ValueError" +@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.workflows.run.run.run_engine") +def test_run_skips_recipe_result_when_workflow_telemetry_is_suppressed( + mock_run_engine, mock_log_recipe_result, monkeypatch +): + monkeypatch.setenv(SUPPRESS_WORKFLOW_TELEMETRY_ENV, "1") + expected_output = object() + mock_run_engine.return_value = expected_output + + output = olive_run( + { + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-0.5B-Instruct", + "task": "text-generation", + } + } + ) + + assert output is expected_output + mock_log_recipe_result.assert_not_called() + + 
+@patch("olive.workflows.run.run.log_recipe_result") +@patch("olive.systems.system_config.SystemConfig.create_system") +def test_run_logs_single_parent_recipe_result_for_docker_host(mock_create_system, mock_log_recipe_result): + expected_output = object() + docker_system = Mock() + + def run_workflow(container_run_config): + container_run_config.engine.host = container_run_config.engine.target + return expected_output + + docker_system.run_workflow.side_effect = run_workflow + mock_create_system.return_value = docker_system + config = { + "input_model": {"type": "ONNXModel", "model_path": "model.onnx"}, + "systems": { + "docker_system": { + "type": "Docker", + "config": { + "dockerfile": "Dockerfile", + "build_context_path": "build_context", + "image_name": "test-image:latest", + "work_dir": "/olive-ws", + }, + }, + "local_system": {"type": "LocalSystem"}, + }, + "engine": {"host": "docker_system", "target": "local_system"}, + } + + output = olive_run(config) + + assert output is expected_output + mock_log_recipe_result.assert_called_once() + metadata = mock_log_recipe_result.call_args.kwargs["metadata"] + assert metadata["host_system_type"] == "Docker" + + @patch("olive.workflows.run.run.log_recipe_result") @patch("olive.workflows.run.run.run_engine") def test_run_logs_recipe_host_metadata_without_explicit_target(mock_run_engine, mock_log_recipe_result): @@ -312,3 +373,26 @@ def test_run_logs_package_config_overrides_when_package_config_provided(mock_run def test_classify_run_config_source_handles_non_pathlike_object(): assert _classify_run_config_source(object()) == ("config_object", "object") + + +def test_classify_input_model_source_does_not_depend_on_local_filesystem(tmp_path, monkeypatch): + assert _classify_input_model_source("Qwen/Qwen2.5-0.5B-Instruct") == "string_name" + + monkeypatch.chdir(tmp_path) + (tmp_path / "bert-base-uncased").mkdir() + + assert _classify_input_model_source("bert-base-uncased") == "string_name" + assert 
_classify_input_model_source("./model.onnx") == "local_file" + + +def test_recipe_hash_does_not_depend_on_local_model_path_presence(tmp_path, monkeypatch): + config = { + "input_model": {"type": "HfModel", "config": {"model_path": "bert-base-uncased"}}, + "engine": {"output_dir": "output"}, + } + recipe_hash = _build_recipe_hash(config) + + monkeypatch.chdir(tmp_path) + (tmp_path / "bert-base-uncased").mkdir() + + assert _build_recipe_hash(config) == recipe_hash From d9247e242ef16fdac8d0f6c95ca87528e066178e Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 5 May 2026 00:45:34 -0500 Subject: [PATCH 20/24] Fix telemetry pipeline test regressions Keep CLI workflow imports patchable so existing CLI tests and command mocks still intercept workflow execution. Update CLI test expectations for recipe telemetry metadata, and make the CI-sensitive workflow telemetry assertion deterministic. Files changed: - olive/cli/base.py - olive/cli/run.py - test/cli/test_cli.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/cli/base.py | 3 ++- olive/cli/run.py | 3 ++- test/cli/test_cli.py | 19 ++++++++++++++++++- test/workflows/test_workflow_run.py | 3 ++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/olive/cli/base.py b/olive/cli/base.py index 5bc34af6f..906584311 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -20,7 +20,6 @@ from olive.hardware.accelerator import AcceleratorSpec from olive.hardware.constants import DEVICE_TO_EXECUTION_PROVIDERS from olive.resource_path import OLIVE_RESOURCE_ANNOTATIONS -from olive.workflows import run as olive_run class BaseOliveCLICommand(ABC): @@ -34,6 +33,8 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option parser.error(f"Unknown arguments: {unknown_args}") def _run_workflow(self): + from olive.workflows import run as olive_run + Path(self.args.output_path).mkdir(parents=True, exist_ok=True) with 
tempfile.TemporaryDirectory(prefix="olive-cli-tmp-", dir=self.args.output_path) as tempdir: diff --git a/olive/cli/run.py b/olive/cli/run.py index 1d8080581..64752626e 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -14,7 +14,6 @@ ) from olive.common.config_utils import load_config_file from olive.telemetry import action -from olive.workflows import run as olive_run class WorkflowRunCommand(BaseOliveCLICommand): @@ -52,6 +51,8 @@ def register_subcommand(parser: ArgumentParser): @action def run(self): + from olive.workflows import run as olive_run + # allow the run_config to be a dict already (for api use) run_config_input = self.args.run_config run_config = run_config_input diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py index a7cb39e24..80c8783ca 100644 --- a/test/cli/test_cli.py +++ b/test/cli/test_cli.py @@ -107,7 +107,17 @@ def test_workflow_run_command(mock_run, tempdir, list_required_packages, tmp_pat # assert mock_run.assert_called_once_with( - {"key": "value"}, package_config=None, tempdir=tempdir, list_required_packages=list_required_packages + {"key": "value"}, + package_config=None, + tempdir=tempdir, + list_required_packages=list_required_packages, + recipe_telemetry_metadata={ + "recipe_command": "WorkflowRun", + "recipe_source": "config_file", + "recipe_format": "json", + "execution_mode": "list_required_packages" if list_required_packages else "run", + "package_config_provided": False, + }, ) @@ -147,6 +157,13 @@ def test_workflow_run_command_with_overrides(mock_run, tmp_path): list_required_packages=False, package_config=None, tempdir=None, + recipe_telemetry_metadata={ + "recipe_command": "WorkflowRun", + "recipe_source": "config_file", + "recipe_format": "json", + "execution_mode": "run", + "package_config_provided": False, + }, ) diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index d3e3961f7..7c711eea1 100644 --- a/test/workflows/test_workflow_run.py +++ 
b/test/workflows/test_workflow_run.py @@ -132,7 +132,8 @@ def test_run_packages(): @patch("olive.workflows.run.run.log_recipe_result") @patch("olive.workflows.run.run.run_engine") -def test_run_logs_recipe_result_success(mock_run_engine, mock_log_recipe_result): +@patch("olive.workflows.run.run.is_ci_environment", return_value=False) +def test_run_logs_recipe_result_success(_, mock_run_engine, mock_log_recipe_result): config = { "input_model": { "type": "HfModel", From befbb084c390dae1246ad7932402f2c869e99080 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Fri, 8 May 2026 22:47:19 -0500 Subject: [PATCH 21/24] Log workflow exceptions as error telemetry Keep OliveRecipe focused on recipe outcome metadata and use the existing log_error path for workflow exceptions. This avoids duplicating exception fields on recipe events while preserving detailed formatted exception messages in error telemetry. Files changed: - olive/telemetry/telemetry_extensions.py - olive/telemetry/telemetry.py - olive/workflows/run/run.py - test/workflows/test_workflow_run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/telemetry/telemetry.py | 1 - olive/telemetry/telemetry_extensions.py | 3 --- olive/workflows/run/run.py | 13 +++++++++---- test/workflows/test_workflow_run.py | 8 ++++++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 274e86c00..a37b29bf5 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -82,7 +82,6 @@ "workflow_id", "config_overrides", "success", - "exception_type", "input_model_type", "input_model_source", "model_task", diff --git a/olive/telemetry/telemetry_extensions.py b/olive/telemetry/telemetry_extensions.py index 8b5fc0412..068aa9dd1 100644 --- a/olive/telemetry/telemetry_extensions.py +++ b/olive/telemetry/telemetry_extensions.py @@ -49,15 +49,12 @@ def log_recipe_result( recipe_name: str, success: bool, metadata: 
Optional[dict[str, Any]] = None, - exception_type: Optional[str] = None, ) -> None: telemetry = _get_logger() attributes = { "recipe_name": recipe_name, "success": success, } - if exception_type: - attributes["exception_type"] = exception_type telemetry.log(RECIPE_EVENT_NAME, attributes, metadata) diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 62c1ca26f..5dd6b44d5 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -21,7 +21,7 @@ from olive.systems.common import SystemType from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.telemetry.telemetry import is_ci_environment -from olive.telemetry.telemetry_extensions import log_recipe_result +from olive.telemetry.telemetry_extensions import _format_exception_message, log_error, log_recipe_result from olive.workflows.run.config import RunConfig if TYPE_CHECKING: @@ -213,7 +213,7 @@ def run( parsed_run_config = None success = False - exception_type = None + exception = None try: package_config = OlivePackageConfig.parse_file_or_obj(package_config) parsed_run_config = RunConfig.parse_file_or_obj(run_config) @@ -238,9 +238,14 @@ def run( success = True return workflow_output except Exception as exc: - exception_type = type(exc).__name__ + exception = exc raise finally: + if exception is not None: + log_error( + exception_type=type(exception).__name__, + exception_message=_format_exception_message(exception, exception.__traceback__), + ) if os.environ.get(SUPPRESS_WORKFLOW_TELEMETRY_ENV) != "1": metadata = _build_recipe_result_metadata( run_config, @@ -252,7 +257,7 @@ def run( package_config_provided=package_config_provided, ) recipe_name = metadata.pop("recipe_name") - log_recipe_result(recipe_name, success=success, metadata=metadata, exception_type=exception_type) + log_recipe_result(recipe_name, success=success, metadata=metadata) def generate_files_from_packages(packages, file_name): diff --git a/test/workflows/test_workflow_run.py 
b/test/workflows/test_workflow_run.py index 7c711eea1..a3c9ade43 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ -200,9 +200,10 @@ def test_run_logs_recipe_result_success(_, mock_run_engine, mock_log_recipe_resu assert config_overrides["systems"][0]["accelerators"][0]["execution_providers"] == ["CUDAExecutionProvider"] +@patch("olive.workflows.run.run.log_error") @patch("olive.workflows.run.run.log_recipe_result") @patch("olive.workflows.run.run.run_engine") -def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result): +def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result, mock_log_error): config = { "input_model": { "type": "HfModel", @@ -228,7 +229,10 @@ def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result) mock_log_recipe_result.assert_called_once() assert mock_log_recipe_result.call_args.args[0] == "Quantize" assert mock_log_recipe_result.call_args.kwargs["success"] is False - assert mock_log_recipe_result.call_args.kwargs["exception_type"] == "ValueError" + assert "exception_type" not in mock_log_recipe_result.call_args.kwargs + mock_log_error.assert_called_once() + assert mock_log_error.call_args.kwargs["exception_type"] == "ValueError" + assert "recipe failed" in mock_log_error.call_args.kwargs["exception_message"] @patch("olive.workflows.run.run.log_recipe_result") From c90e653fea92bd14f7e18522a0eca8d34d9d4ce4 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Fri, 8 May 2026 23:13:05 -0500 Subject: [PATCH 22/24] Reduce CLI import churn Keep optional and workflow-heavy imports lazy so generated CLI commands preserve existing import behavior while still reporting recipe telemetry. 
Files changed: - olive/cli/base.py - olive/cli/run.py - olive/workflows/run/run.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/cli/base.py | 16 ++++++++-------- olive/cli/run.py | 5 +++-- olive/workflows/run/run.py | 3 +-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/olive/cli/base.py b/olive/cli/base.py index 906584311..26bd04210 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -2,17 +2,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -import importlib import json import re -import tempfile from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace from pathlib import Path from typing import ClassVar, Optional -from packaging import version - from olive.common.constants import DEFAULT_HF_TASK from olive.common.user_module_loader import UserModuleLoader from olive.common.utils import hf_repo_exists, set_nested_dict_value, unescaped_str @@ -33,6 +29,8 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option parser.error(f"Unknown arguments: {unknown_args}") def _run_workflow(self): + import tempfile + from olive.workflows import run as olive_run Path(self.args.output_path).mkdir(parents=True, exist_ok=True) @@ -64,16 +62,18 @@ def _get_recipe_telemetry_metadata(self) -> dict[str, str]: @staticmethod def _parse_extra_options(kv_items): - ort_genai = importlib.import_module("onnxruntime_genai") - ort_genai_version = ort_genai.__version__ + from onnxruntime_genai import __version__ as OrtGenaiVersion + from packaging import version - if version.parse(ort_genai_version) <= version.parse("0.9.0"): + if version.parse(OrtGenaiVersion) <= version.parse("0.9.0"): raise ValueError( "onnxruntime-genai version <= 0.9.0 is not supported for extra_options in CLI. 
" "Please either upgrade to onnxruntime-genai version > 0.9.0 or use the model builder pass directly in the config file." ) - return importlib.import_module("onnxruntime_genai.models.builder").parse_extra_options(kv_items) + from onnxruntime_genai.models.builder import parse_extra_options + + return parse_extra_options(kv_items) @staticmethod def _save_config_file(config: dict): diff --git a/olive/cli/run.py b/olive/cli/run.py index 64752626e..4b0360fed 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from argparse import ArgumentParser -from pathlib import Path from olive.cli.base import ( BaseOliveCLICommand, @@ -12,7 +11,6 @@ add_telemetry_options, get_input_model_config, ) -from olive.common.config_utils import load_config_file from olive.telemetry import action @@ -51,6 +49,9 @@ def register_subcommand(parser: ArgumentParser): @action def run(self): + from pathlib import Path + + from olive.common.config_utils import load_config_file from olive.workflows import run as olive_run # allow the run_config to be a dict already (for api use) diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 5dd6b44d5..7d425152c 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import functools -import importlib import json import logging import os @@ -141,7 +140,7 @@ def run_engine(package_config: OlivePackageConfig, run_config: RunConfig): # ort_log_severity_level: C++ logging levels try: - ort = importlib.import_module("onnxruntime") + import onnxruntime as ort ort.set_default_logger_severity(run_config.engine.ort_log_severity_level) except Exception: From 51d08fd604df1846105cfaaadf5a3128c8a36090 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sat, 9 May 2026 00:39:02 -0500 Subject: [PATCH 23/24] Simplify Docker recipe telemetry suppression Use an explicit run() parameter for the inner Docker workflow runner instead of an environment variable so only the parent workflow emits the recipe result event. Files changed: - olive/workflows/run/run.py - olive/systems/docker/workflow_runner.py - olive/systems/docker/docker_system.py - olive/telemetry/constants.py - test/workflows/test_workflow_run.py - test/systems/docker/test_docker_system.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/systems/docker/docker_system.py | 2 -- olive/systems/docker/workflow_runner.py | 2 +- olive/telemetry/constants.py | 1 - olive/workflows/run/run.py | 5 ++--- test/systems/docker/test_docker_system.py | 17 ++++++++++++++--- test/workflows/test_workflow_run.py | 9 +++------ 6 files changed, 20 insertions(+), 16 deletions(-) diff --git a/olive/systems/docker/docker_system.py b/olive/systems/docker/docker_system.py index 07e388cd6..a440bc2b0 100644 --- a/olive/systems/docker/docker_system.py +++ b/olive/systems/docker/docker_system.py @@ -20,7 +20,6 @@ from olive.systems.common import AcceleratorConfig, SystemType from olive.systems.olive_system import OliveSystem from olive.systems.system_config import LocalTargetUserConfig, SystemConfig -from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.telemetry.telemetry import 
is_ci_environment from olive.workflows.run.config import RunConfig @@ -244,7 +243,6 @@ def _prepare_environment(self, base_env) -> dict: # Add default environment variables environment.setdefault("PYTHONPYCACHEPREFIX", "/tmp") environment["OLIVE_LOG_LEVEL"] = logging.getLevelName(logger.getEffectiveLevel()) - environment[SUPPRESS_WORKFLOW_TELEMETRY_ENV] = "1" if is_ci_environment(): environment["CI"] = "1" diff --git a/olive/systems/docker/workflow_runner.py b/olive/systems/docker/workflow_runner.py index 5842d0bd4..be0d59d67 100644 --- a/olive/systems/docker/workflow_runner.py +++ b/olive/systems/docker/workflow_runner.py @@ -20,7 +20,7 @@ def runner_entry(config): config = json.load(f) logger.info("Running workflow with config: %s", config) - olive_run(config) + olive_run(config, emit_recipe_telemetry=False) if __name__ == "__main__": diff --git a/olive/telemetry/constants.py b/olive/telemetry/constants.py index 420da88b3..25a60e813 100644 --- a/olive/telemetry/constants.py +++ b/olive/telemetry/constants.py @@ -6,4 +6,3 @@ """Telemetry constants.""" CONNECTION_STRING = "SW5zdHJ1bWVudGF0aW9uS2V5PTYyMTUwOTExZGMwMDRmYzliYjY3YmE5NjA2NDI3ZTU2LWVjNjFmOWFmLTVkN2EtNGQxOS1hZjMxLWI5Y2Q2OWU5ODdmMS02OTE1" -SUPPRESS_WORKFLOW_TELEMETRY_ENV = "OLIVE_SUPPRESS_WORKFLOW_TELEMETRY" diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py index 7d425152c..dc338f69c 100644 --- a/olive/workflows/run/run.py +++ b/olive/workflows/run/run.py @@ -5,7 +5,6 @@ import functools import json import logging -import os from copy import deepcopy from os import PathLike from pathlib import Path, PurePosixPath, PureWindowsPath @@ -18,7 +17,6 @@ from olive.package_config import OlivePackageConfig from olive.systems.accelerator_creator import create_accelerator from olive.systems.common import SystemType -from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.telemetry.telemetry import is_ci_environment from olive.telemetry.telemetry_extensions import 
_format_exception_message, log_error, log_recipe_result from olive.workflows.run.config import RunConfig @@ -189,6 +187,7 @@ def run( package_config: Optional[Union[str, Path, dict]] = None, tempdir: Optional[Union[str, Path]] = None, recipe_telemetry_metadata: Optional[dict[str, Any]] = None, + emit_recipe_telemetry: bool = True, ): # set tempdir set_tempdir(tempdir) @@ -245,7 +244,7 @@ def run( exception_type=type(exception).__name__, exception_message=_format_exception_message(exception, exception.__traceback__), ) - if os.environ.get(SUPPRESS_WORKFLOW_TELEMETRY_ENV) != "1": + if emit_recipe_telemetry: metadata = _build_recipe_result_metadata( run_config, run_config_telemetry_input, diff --git a/test/systems/docker/test_docker_system.py b/test/systems/docker/test_docker_system.py index 3d668c6b6..ef20a43d1 100644 --- a/test/systems/docker/test_docker_system.py +++ b/test/systems/docker/test_docker_system.py @@ -2,13 +2,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +import json from unittest.mock import MagicMock, patch import pytest from olive.systems.docker.docker_system import DockerSystem from olive.systems.system_config import DockerTargetUserConfig -from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from test.utils import ONNX_MODEL_PATH # pylint: disable=attribute-defined-outside-init,protected-access @@ -137,7 +137,6 @@ def test_run_workflow(self, mock_find_resources, mock_tempdir, mock_from_env, tm command = mock_docker_client.containers.run.call_args[1]["command"] assert "workflow_runner.py" in command assert "--config" in command - assert mock_docker_client.containers.run.call_args.kwargs["environment"][SUPPRESS_WORKFLOW_TELEMETRY_ENV] == "1" # Verify cleanup mock_container.remove.assert_called_once() @@ -159,7 +158,19 @@ def test_prepare_environment_forwards_ci_to_workflow_container(self, mock_from_e environment = docker_system._prepare_environment({}) assert environment["CI"] == "1" - assert environment[SUPPRESS_WORKFLOW_TELEMETRY_ENV] == "1" + + def test_workflow_runner_disables_inner_recipe_telemetry(self, tmp_path, monkeypatch): + from olive.systems.docker import workflow_runner + + monkeypatch.delenv("HF_TOKEN", raising=False) + config = {"input_model": {"type": "ONNXModel", "model_path": "model.onnx"}} + config_path = tmp_path / "config.json" + config_path.write_text(json.dumps(config)) + + with patch.object(workflow_runner, "olive_run") as mock_olive_run: + workflow_runner.runner_entry(config_path) + + mock_olive_run.assert_called_once_with(config, emit_recipe_telemetry=False) @patch("olive.systems.docker.docker_system.docker.from_env") @patch("olive.systems.docker.docker_system.tempfile.TemporaryDirectory") diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py index a3c9ade43..a79d0c251 100644 --- a/test/workflows/test_workflow_run.py +++ b/test/workflows/test_workflow_run.py @@ 
-6,7 +6,6 @@ import pytest -from olive.telemetry.constants import SUPPRESS_WORKFLOW_TELEMETRY_ENV from olive.workflows import run as olive_run from olive.workflows.run.run import _build_recipe_hash, _classify_input_model_source, _classify_run_config_source from test.utils import ( @@ -237,10 +236,7 @@ def test_run_logs_recipe_result_failure(mock_run_engine, mock_log_recipe_result, @patch("olive.workflows.run.run.log_recipe_result") @patch("olive.workflows.run.run.run_engine") -def test_run_skips_recipe_result_when_workflow_telemetry_is_suppressed( - mock_run_engine, mock_log_recipe_result, monkeypatch -): - monkeypatch.setenv(SUPPRESS_WORKFLOW_TELEMETRY_ENV, "1") +def test_run_skips_recipe_result_when_recipe_telemetry_is_not_emitted(mock_run_engine, mock_log_recipe_result): expected_output = object() mock_run_engine.return_value = expected_output @@ -251,7 +247,8 @@ def test_run_skips_recipe_result_when_workflow_telemetry_is_suppressed( "model_path": "Qwen/Qwen2.5-0.5B-Instruct", "task": "text-generation", } - } + }, + emit_recipe_telemetry=False, ) assert output is expected_output From 38f118c841a54c84627882b90bc71a2e45638195 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sat, 9 May 2026 01:55:59 -0500 Subject: [PATCH 24/24] Keep platform imports local Restore local imports for Docker token lookup and telemetry file locking to avoid unnecessary module-level import churn. 
Files changed: - olive/systems/docker/docker_system.py - olive/telemetry/utils.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/systems/docker/docker_system.py | 6 ++++-- olive/telemetry/utils.py | 13 ++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/olive/systems/docker/docker_system.py b/olive/systems/docker/docker_system.py index a440bc2b0..2a479ec69 100644 --- a/olive/systems/docker/docker_system.py +++ b/olive/systems/docker/docker_system.py @@ -5,7 +5,6 @@ import copy import json import logging -import os import sys import tempfile from pathlib import Path @@ -20,7 +19,6 @@ from olive.systems.common import AcceleratorConfig, SystemType from olive.systems.olive_system import OliveSystem from olive.systems.system_config import LocalTargetUserConfig, SystemConfig -from olive.telemetry.telemetry import is_ci_environment from olive.workflows.run.config import RunConfig if TYPE_CHECKING: @@ -234,6 +232,8 @@ def _prepare_run_params(self) -> dict: def _prepare_environment(self, base_env) -> dict: """Prepare environment variables for container.""" + from olive.telemetry.telemetry import is_ci_environment + # Convert list to dict if needed if isinstance(base_env, list): environment = {env.split("=")[0]: env.split("=")[1] for env in base_env} @@ -307,6 +307,8 @@ def _create_runner_script_mount(self) -> tuple[str, str]: @staticmethod def _get_huggingface_token() -> Optional[str]: """Get HuggingFace token from environment or file.""" + import os + # Check environment variable token = os.getenv("HF_TOKEN") if token: diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 28cec45eb..806f5f93d 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -9,15 +9,9 @@ from pathlib import Path from typing import ClassVar -if os.name == "posix": - import fcntl -else: - fcntl = None - if os.name == "nt": import ctypes import ctypes.wintypes as wintypes - import msvcrt _LOCKFILE_EXCLUSIVE_LOCK = 
0x00000002 @@ -52,7 +46,6 @@ class _Overlapped(ctypes.Structure): _unlock_file_ex.restype = wintypes.BOOL else: ctypes = None - msvcrt = None wintypes = None _lock_file_ex = None _unlock_file_ex = None @@ -130,8 +123,12 @@ def __enter__(self): try: # Platform-specific locking if os.name == "posix": + import fcntl + fcntl.flock(self.file.fileno(), fcntl.LOCK_EX) elif os.name == "nt": + import msvcrt + self._windows_overlapped = _Overlapped() handle = msvcrt.get_osfhandle(self.file.fileno()) if not _lock_file_ex( @@ -155,6 +152,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): if self.file: try: if os.name == "nt" and self._windows_overlapped is not None: + import msvcrt + handle = msvcrt.get_osfhandle(self.file.fileno()) if not _unlock_file_ex( handle,