microsoft · DingmaomaoBJTU · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
@@ -121,6 +121,43 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=None,
         help="If set, copy the two ONNX (with external data) here as prefill.onnx / decode.onnx.",
     )
+
+    genai = p.add_argument_group(
+        "genai bundle",
+        "Options for producing an onnxruntime-genai inference bundle.",
+    )
+    genai.add_argument(
+        "--genai-bundle",
+        type=Path,
+        default=None,
+        metavar="DIR",
+        help=(
+            "If set, assemble a complete onnxruntime-genai bundle in DIR: "
+            "ctx.onnx (prefill), iter.onnx (decode), genai_config.json, and "
+            "tokenizer files.  Provide --embeddings and --lm-head to include "
+            "the placeholder models required for end-to-end inference."
+        ),
+    )
+    genai.add_argument(
+        "--embeddings",
+        type=Path,
+        default=None,
+        metavar="ONNX",
+        help=(
+            "Path to the embeddings ONNX to copy into the genai bundle as "
+            "embeddings.onnx.  Required for end-to-end genai inference."
+        ),
+    )
+    genai.add_argument(
+        "--lm-head",
+        type=Path,
+        default=None,
+        metavar="ONNX",
+        help=(
+            "Path to the lm_head ONNX to copy into the genai bundle as "
+            "lm_head.onnx.  Required for end-to-end genai inference."
+        ),
+    )
     return p.parse_args(argv)
 
 
@@ -164,6 +201,39 @@ def main(argv: list[str] | None = None) -> int:
             copy_onnx_model(src, dst)
             print(f"   -> copied to {dst}")
 
+    # -----------------------------------------------------------------------
+    # Optional: assemble an onnxruntime-genai bundle.
+    # -----------------------------------------------------------------------
+    if args.genai_bundle is not None:
+        from winml.modelkit.models.hf.qwen3.genai import write_genai_bundle
+
+        prefill_path = Path(model.sub_models["decoder_prefill"].onnx_path)
+        decode_path = Path(model.sub_models["decoder_gen"].onnx_path)
+
+        print(f"\n=== assembling genai bundle -> {args.genai_bundle} ===")
+        config_path = write_genai_bundle(
+            args.genai_bundle,
+            context_onnx=prefill_path,
+            iterator_onnx=decode_path,
+            model_id=args.model_id,
+            max_cache_len=args.max_cache_len,
+            prefill_seq_len=args.prefill_seq_len,
+            embeddings_src=args.embeddings,
+            lm_head_src=args.lm_head,
+            ep="qnn" if args.device == "npu" else args.device,
+        )
+        print(f"   genai_config.json -> {config_path}")
+        if args.embeddings is None:
+            print(
+                "   WARNING: --embeddings not provided; "
+                "add embeddings.onnx to the bundle before inference."
+            )
+        if args.lm_head is None:
+            print(
+                "   WARNING: --lm-head not provided; "
+                "add lm_head.onnx to the bundle before inference."
+            )
+
     return 0
 
 

@@ -0,0 +1,151 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+r"""onnxruntime-genai inference for a genai bundle (decoder-pipeline).
+
+Loads the genai bundle produced by ``export_qwen3_transformer_only.py
+--genai-bundle <DIR>`` and runs greedy text generation using
+:class:`~winml.modelkit.session.GenaiSession`.
+
+The bundle directory must contain ``genai_config.json`` and the four ONNX
+graphs it references (``embeddings.onnx``, ``ctx.onnx``, ``iter.onnx``,
+``lm_head.onnx``) plus HF tokenizer files.
+
+Usage::
+
+    # CPU sanity check (works anywhere onnxruntime-genai is installed)
+    uv run python scripts/infer_genai.py --prompt "Hello, who are you?" --ep cpu --chat
+
+    # Qualcomm NPU (registers the QNN EP via the Windows ML EP catalog)
+    uv run python scripts/infer_genai.py \
+        --prompt "Explain what a transformer is." \
+        --ep qnn --chat
+
+    # Point at a non-default bundle
+    uv run python scripts/infer_genai.py \
+        --model-dir out/my_bundle --prompt "Hi" --ep cpu
+
+    # Pre-compile QNN stages to EPContext on first run; reuse cache on subsequent runs.
+    # Eliminates per-run JIT overhead (~60-90 s saved on Snapdragon X Elite).
+    uv run python scripts/infer_genai.py \
+        --prompt "Hello" --ep mixed --compile
+
+Dependencies (install in a fresh venv)::
+
+    pip install onnxruntime-genai-winml
+    pip install "windowsml[with-ort]"   # registers QNN EP; also provides onnxruntime
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import time
+from pathlib import Path
+
+from winml.modelkit.session import GenaiSession, GenerationConfig
+
+
+# Default bundle directory: <repo-root>/out/qwen3_bundle
+# The repo root is computed as the parent of the directory that contains this
+# script, i.e. the script is expected to sit one level below the root.
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+DEFAULT_MODEL_DIR = _REPO_ROOT / "out" / "qwen3_bundle"
+
+# Accepted values for the --ep command-line option (used as argparse choices).
+_SUPPORTED_EPS = ["cpu", "mixed", "qnn", "dml"]
+
+
+def _wrap_chat_template(prompt: str) -> str:
+    """Return *prompt* passed through ``GenaiSession.apply_chatml_template``."""
+    templated = GenaiSession.apply_chatml_template(prompt)
+    return templated
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Build this script's argument parser and parse *argv*.
+
+    Args:
+        argv: Argument strings to parse; ``None`` lets argparse read
+            ``sys.argv[1:]``.
+
+    Returns:
+        Namespace with ``prompt``, ``model_dir``, ``ep``, ``max_new``,
+        ``chat``, ``compile`` and ``verbose`` attributes.
+    """
+    # The module docstring is reused verbatim as the --help description.
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+
+    model_dir_help = (
+        "Path to the genai bundle directory containing genai_config.json "
+        "and the ONNX / tokenizer files (default: %(default)s)."
+    )
+    ep_help = (
+        "Execution provider: 'mixed' uses genai_config.json as-is (default); "
+        "'cpu' forces all stages to CPU; 'qnn'/'dml' for full NPU/GPU."
+    )
+    compile_help = (
+        "Pre-compile QNN pipeline stages to EPContext ONNX before loading. "
+        "On first use this triggers ort.ModelCompiler per stage (~60-90 s for iter). "
+        "Compiled artifacts are cached in bundle_dir/_compiled/; "
+        "subsequent runs reuse the cache and skip JIT. "
+        "Has no effect when --ep cpu."
+    )
+
+    # Arguments are registered in the order they appear in --help.
+    parser.add_argument(
+        "--prompt",
+        default="Give me a short introduction to large language models.",
+        help="Input prompt (default: %(default)s).",
+    )
+    parser.add_argument(
+        "--model-dir",
+        type=Path,
+        default=DEFAULT_MODEL_DIR,
+        metavar="DIR",
+        help=model_dir_help,
+    )
+    parser.add_argument(
+        "--ep",
+        choices=_SUPPORTED_EPS,
+        default="mixed",
+        help=ep_help,
+    )
+    parser.add_argument(
+        "--max-new",
+        type=int,
+        default=128,
+        help="Maximum number of new tokens to generate (default: %(default)s).",
+    )
+    parser.add_argument(
+        "--chat",
+        action="store_true",
+        help="Wrap --prompt in the ChatML template (<|im_start|>user/assistant).",
+    )
+    parser.add_argument("--compile", action="store_true", help=compile_help)
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable onnxruntime-genai native model I/O logging.",
+    )
+
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Load the genai bundle, stream a greedy generation to stdout, report speed.
+
+    Args:
+        argv: Command-line arguments forwarded to :func:`parse_args`;
+            ``None`` means use the process arguments.
+
+    Returns:
+        ``0`` on success, ``1`` when constructing the session raises
+        ``FileNotFoundError`` (the message is printed to stderr).
+    """
+    args = parse_args(argv)
+
+    text = _wrap_chat_template(args.prompt) if args.chat else args.prompt
+    # do_sample=False: this script always decodes greedily.
+    gen_cfg = GenerationConfig(max_new_tokens=args.max_new, do_sample=False)
+
+    # NOTE(review): the session package also exports GenaiSessionError,
+    # GenaiLoadError and GenaiNotInstalledError; confirm whether those should
+    # be reported here the same way instead of surfacing as tracebacks.
+    try:
+        session = GenaiSession(
+            args.model_dir, ep=args.ep, verbose=args.verbose, compile=args.compile
+        )
+    except FileNotFoundError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 1
+
+    print(f"[load] ep={args.ep}  bundle={args.model_dir}")
+    with session:
+        print(f"[ctx]  context_length={session.context_length}")
+        print("[gen] ", end="", flush=True)
+        n = 0
+        # perf_counter is the high-resolution clock intended for timing
+        # short intervals.
+        t0 = time.perf_counter()
+        for token_str in session.generate_streaming(text, gen_cfg):
+            print(token_str, end="", flush=True)
+            n += 1
+        # Stop the clock before leaving the ``with`` block so that session
+        # teardown is not counted in the throughput figure.
+        dt = time.perf_counter() - t0
+
+    # Guard the division: an empty generation can finish within one clock tick.
+    rate = n / dt if dt > 0 else 0.0
+    print(f"\n\n[done] {n} tokens in {dt:.1f}s  ({rate:.1f} tok/s)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -3,4 +3,30 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-"""Qwen3 transformer-only export support (modeling, export ops, IO configs)."""
+"""Qwen3 transformer-only export + genai bundle support.
+
+Modules:
+  qwen_transformer_only  — OnnxConfig, build config, composite model class.
+  qwen3_modeling         — winml-owned Qwen3 module definitions (forward bindings).
+  qwen3_export_ops       — custom ONNX symbolic ops (LpNorm, GQA, 1x1 Conv).
+  genai                  — genai_config.json generator + bundle assembler.
+"""
+
+from .genai import (
+    DecoderIOMapping,
+    PipelineStage,
+    build_decoder_pipeline_stages,
+    build_genai_config,
+    build_qwen3_transformer_only_stages,
+    write_genai_bundle,
+)
+
+
+# Public names of this package; each one is re-exported from the ``genai``
+# submodule imported above.
+__all__ = [
+    "DecoderIOMapping",
+    "PipelineStage",
+    "build_decoder_pipeline_stages",
+    "build_genai_config",
+    "build_qwen3_transformer_only_stages",
+    "write_genai_bundle",
+]
@@ -0,0 +1,53 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Qwen3 genai bundle support — thin shim over :mod:`winml.modelkit.utils.genai`.
+
+All generic logic (``PipelineStage``, ``DecoderIOMapping``, ``build_genai_config``,
+``build_decoder_pipeline_stages``, ``write_genai_bundle``) lives in
+:mod:`winml.modelkit.utils.genai` so it can be reused by other model families.
+
+This module re-exports that API unchanged and adds
+``build_qwen3_transformer_only_stages`` as a backward-compatible alias for
+``build_decoder_pipeline_stages``.  New code should prefer the generic names.
+"""
+
+from __future__ import annotations
+
+from winml.modelkit.utils.genai import (
+    DEFAULT_CONTEXT_FILENAME,
+    DEFAULT_EMBEDDINGS_FILENAME,
+    DEFAULT_ITERATOR_FILENAME,
+    DEFAULT_LM_HEAD_FILENAME,
+    DecoderIOMapping,
+    PipelineStage,
+    _detect_format_patterns,
+    build_decoder_pipeline_stages,
+    build_genai_config,
+    qnn_stage_session_options,
+    write_genai_bundle,
+)
+
+
+# Backward-compatible alias: existing callers that import
+# ``build_qwen3_transformer_only_stages`` continue to work unchanged.
+# Both names are bound to the same function object.
+build_qwen3_transformer_only_stages = build_decoder_pipeline_stages
+
+# Keep the internal helper accessible for tests that import it directly:
+# the underscore-prefixed name is bound to the public
+# ``qnn_stage_session_options`` imported above.
+_qnn_stage_session_options = qnn_stage_session_options
+
+# Names exported by ``from ... import *``.
+# NOTE(review): ``_detect_format_patterns`` is underscore-prefixed yet listed
+# here, while the ``_qnn_stage_session_options`` alias is not; confirm this
+# asymmetry is intentional (it may only be here to mark the import as used).
+__all__ = [
+    "DEFAULT_CONTEXT_FILENAME",
+    "DEFAULT_EMBEDDINGS_FILENAME",
+    "DEFAULT_ITERATOR_FILENAME",
+    "DEFAULT_LM_HEAD_FILENAME",
+    "DecoderIOMapping",
+    "PipelineStage",
+    "_detect_format_patterns",
+    "build_decoder_pipeline_stages",
+    "build_genai_config",
+    "build_qwen3_transformer_only_stages",
+    "qnn_stage_session_options",
+    "write_genai_bundle",
+]
@@ -18,6 +18,7 @@
 
 from typing import TYPE_CHECKING, Any
 
+from .calibration import get_quant_finalizer
 from .config import QuantizeResult, WinMLQuantizationConfig
 
 
@@ -29,18 +30,17 @@
 ]
 
 
-# Names below are loaded lazily via ``__getattr__`` to avoid pulling in
-# onnxruntime.quantization/torch at import time. The TYPE_CHECKING re-imports
-# give static analyzers (mypy, CodeQL) visibility into what ``__all__`` exports
-# without triggering the heavy imports at runtime.
+# ``quantize_onnx`` is loaded lazily via ``__getattr__`` to avoid pulling in
+# onnxruntime.quantization at import time. The TYPE_CHECKING re-import gives
+# static analyzers (mypy, CodeQL) visibility into what ``__all__`` exports.
+# ``get_quant_finalizer`` is imported directly above — its module chain
+# (calibration/__init__ -> registry) is lightweight and safe at import time.
 if TYPE_CHECKING:
-    from .calibration import get_quant_finalizer
     from .quantizer import quantize_onnx
 
 
 _LAZY_IMPORTS: dict[str, tuple[str, str]] = {
     "quantize_onnx": (".quantizer", "quantize_onnx"),
-    "get_quant_finalizer": (".calibration", "get_quant_finalizer"),
 }
 
 

@@ -38,3 +38,4 @@ def finalize(
         model_id: str | None = None,
     ) -> WinMLQuantizationConfig:
         """Return ``quant`` populated with the graph-derived quant settings."""
+        ...
@@ -5,6 +5,13 @@
 """WinMLSession - ONNX Runtime session manager with WinML EP integration."""
 
 from .ep_registry import WinMLEPRegistry
+from .genai_session import (
+    GenaiLoadError,
+    GenaiNotInstalledError,
+    GenaiSession,
+    GenaiSessionError,
+    GenerationConfig,
+)
 from .monitor.ep_monitor import EPMonitor, NullEPMonitor
 from .monitor.hw_monitor import HWMonitor
 from .monitor.openvino_monitor import OpenVinoMonitor
@@ -17,6 +24,11 @@
 
 __all__ = [
     "EPMonitor",
+    "GenaiLoadError",
+    "GenaiNotInstalledError",
+    "GenaiSession",
+    "GenaiSessionError",
+    "GenerationConfig",
     "HWMonitor",
     "InferenceError",
     "NullEPMonitor",