Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions scripts/export_qwen3_transformer_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,43 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
default=None,
help="If set, copy the two ONNX (with external data) here as prefill.onnx / decode.onnx.",
)

genai = p.add_argument_group(
"genai bundle",
"Options for producing an onnxruntime-genai inference bundle.",
)
genai.add_argument(
"--genai-bundle",
type=Path,
default=None,
metavar="DIR",
help=(
"If set, assemble a complete onnxruntime-genai bundle in DIR: "
"ctx.onnx (prefill), iter.onnx (decode), genai_config.json, and "
"tokenizer files. Provide --embeddings and --lm-head to include "
"the placeholder models required for end-to-end inference."
),
)
genai.add_argument(
"--embeddings",
type=Path,
default=None,
metavar="ONNX",
help=(
"Path to the embeddings ONNX to copy into the genai bundle as "
"embeddings.onnx. Required for end-to-end genai inference."
),
)
genai.add_argument(
"--lm-head",
type=Path,
default=None,
metavar="ONNX",
help=(
"Path to the lm_head ONNX to copy into the genai bundle as "
"lm_head.onnx. Required for end-to-end genai inference."
),
)
return p.parse_args(argv)


Expand Down Expand Up @@ -164,6 +201,39 @@ def main(argv: list[str] | None = None) -> int:
copy_onnx_model(src, dst)
print(f" -> copied to {dst}")

# -----------------------------------------------------------------------
# Optional: assemble an onnxruntime-genai bundle.
# -----------------------------------------------------------------------
if args.genai_bundle is not None:
from winml.modelkit.models.hf.qwen3.genai import write_genai_bundle

prefill_path = Path(model.sub_models["decoder_prefill"].onnx_path)
decode_path = Path(model.sub_models["decoder_gen"].onnx_path)

print(f"\n=== assembling genai bundle -> {args.genai_bundle} ===")
config_path = write_genai_bundle(
args.genai_bundle,
context_onnx=prefill_path,
iterator_onnx=decode_path,
model_id=args.model_id,
max_cache_len=args.max_cache_len,
prefill_seq_len=args.prefill_seq_len,
embeddings_src=args.embeddings,
lm_head_src=args.lm_head,
ep="qnn" if args.device == "npu" else args.device,
)
print(f" genai_config.json -> {config_path}")
if args.embeddings is None:
print(
" WARNING: --embeddings not provided; "
"add embeddings.onnx to the bundle before inference."
)
if args.lm_head is None:
print(
" WARNING: --lm-head not provided; "
"add lm_head.onnx to the bundle before inference."
)

return 0


Expand Down
151 changes: 151 additions & 0 deletions scripts/infer_genai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
r"""onnxruntime-genai inference for a genai bundle (decoder-pipeline).

Loads the genai bundle produced by ``export_qwen3_transformer_only.py
--genai-bundle <DIR>`` and runs greedy text generation using
:class:`~winml.modelkit.session.GenaiSession`.

The bundle directory must contain ``genai_config.json`` and the four ONNX
graphs it references (``embeddings.onnx``, ``ctx.onnx``, ``iter.onnx``,
``lm_head.onnx``) plus HF tokenizer files.

Usage::

# CPU sanity check (works anywhere onnxruntime-genai is installed)
uv run python scripts/infer_genai.py --prompt "Hello, who are you?" --chat --ep cpu

# Qualcomm NPU (registers the QNN EP via the Windows ML EP catalog)
uv run python scripts/infer_genai.py \
--prompt "Explain what a transformer is." \
--ep qnn --chat

# Point at a non-default bundle
uv run python scripts/infer_genai.py \
--model-dir out/my_bundle --prompt "Hi" --ep cpu

# Pre-compile QNN stages to EPContext on first run; reuse cache on subsequent runs.
# Eliminates per-run JIT overhead (~60-90 s saved on Snapdragon X Elite).
uv run python scripts/infer_genai.py \
--prompt "Hello" --ep mixed --compile

Dependencies (install in a fresh venv)::

pip install onnxruntime-genai-winml
pip install "windowsml[with-ort]" # registers QNN EP; also provides onnxruntime
"""

from __future__ import annotations

import argparse
import sys
import time
from pathlib import Path

from winml.modelkit.session import GenaiSession, GenerationConfig


# Repository root: this script lives one directory below it (scripts/).
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Bundle directory used when --model-dir is not given: <repo-root>/out/qwen3_bundle
DEFAULT_MODEL_DIR = _REPO_ROOT.joinpath("out", "qwen3_bundle")

# Values accepted by the --ep command-line option.
_SUPPORTED_EPS = ["cpu", "mixed", "qnn", "dml"]


def _wrap_chat_template(prompt: str) -> str:
    """Return *prompt* formatted by ``GenaiSession.apply_chatml_template``."""
    wrapped = GenaiSession.apply_chatml_template(prompt)
    return wrapped


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser for this script and parse *argv*.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        Namespace with ``prompt``, ``model_dir``, ``ep``, ``max_new``, ``chat``,
        ``compile`` and ``verbose`` attributes.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )

    # Declared as (flag, keyword-arguments) pairs and registered in this order so
    # the generated --help lists the options in the same sequence.
    option_specs = [
        (
            "--prompt",
            {
                "default": "Give me a short introduction to large language models.",
                "help": "Input prompt (default: %(default)s).",
            },
        ),
        (
            "--model-dir",
            {
                "type": Path,
                "default": DEFAULT_MODEL_DIR,
                "metavar": "DIR",
                "help": (
                    "Path to the genai bundle directory containing genai_config.json "
                    "and the ONNX / tokenizer files (default: %(default)s)."
                ),
            },
        ),
        (
            "--ep",
            {
                "choices": _SUPPORTED_EPS,
                "default": "mixed",
                "help": (
                    "Execution provider: 'mixed' uses genai_config.json as-is (default); "
                    "'cpu' forces all stages to CPU; 'qnn'/'dml' for full NPU/GPU."
                ),
            },
        ),
        (
            "--max-new",
            {
                "type": int,
                "default": 128,
                "help": "Maximum number of new tokens to generate (default: %(default)s).",
            },
        ),
        (
            "--chat",
            {
                "action": "store_true",
                "help": "Wrap --prompt in the ChatML template (<|im_start|>user/assistant).",
            },
        ),
        (
            "--compile",
            {
                "action": "store_true",
                "help": (
                    "Pre-compile QNN pipeline stages to EPContext ONNX before loading. "
                    "On first use this triggers ort.ModelCompiler per stage (~60-90 s for iter). "
                    "Compiled artifacts are cached in bundle_dir/_compiled/; "
                    "subsequent runs reuse the cache and skip JIT. "
                    "Has no effect when --ep cpu."
                ),
            },
        ),
        (
            "--verbose",
            {
                "action": "store_true",
                "help": "Enable onnxruntime-genai native model I/O logging.",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Load the genai bundle and stream a greedy generation to stdout.

    Args:
        argv: Command-line arguments. ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code: ``0`` after a completed generation, ``1`` when
        constructing the session raises ``FileNotFoundError``.
    """
    args = parse_args(argv)

    text = _wrap_chat_template(args.prompt) if args.chat else args.prompt
    gen_cfg = GenerationConfig(max_new_tokens=args.max_new, do_sample=False)

    try:
        session = GenaiSession(
            args.model_dir, ep=args.ep, verbose=args.verbose, compile=args.compile
        )
    except FileNotFoundError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 1

    print(f"[load] ep={args.ep} bundle={args.model_dir}")
    with session:
        print(f"[ctx] context_length={session.context_length}")
        print("[gen] ", end="", flush=True)
        # perf_counter is the highest-resolution clock for measuring short
        # intervals; a coarse clock can report 0.0 s for a very short run.
        t0 = time.perf_counter()
        n = 0
        for token_str in session.generate_streaming(text, gen_cfg):
            print(token_str, end="", flush=True)
            n += 1

        dt = time.perf_counter() - t0
        # Guard the throughput division: an elapsed time of zero must not
        # turn a successful run into a ZeroDivisionError.
        rate = n / dt if dt > 0 else 0.0
        print(f"\n\n[done] {n} tokens in {dt:.1f}s ({rate:.1f} tok/s)")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
28 changes: 27 additions & 1 deletion src/winml/modelkit/models/hf/qwen3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,30 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Qwen3 transformer-only export support (modeling, export ops, IO configs)."""
"""Qwen3 transformer-only export + genai bundle support.

Modules:
qwen_transformer_only — OnnxConfig, build config, composite model class.
qwen3_modeling — winml-owned Qwen3 module definitions (forward bindings).
qwen3_export_ops — custom ONNX symbolic ops (LpNorm, GQA, 1x1 Conv).
genai — genai_config.json generator + bundle assembler.
"""

from .genai import (
DecoderIOMapping,
PipelineStage,
build_decoder_pipeline_stages,
build_genai_config,
build_qwen3_transformer_only_stages,
write_genai_bundle,
)


__all__ = [
"DecoderIOMapping",
"PipelineStage",
"build_decoder_pipeline_stages",
"build_genai_config",
"build_qwen3_transformer_only_stages",
"write_genai_bundle",
]
53 changes: 53 additions & 0 deletions src/winml/modelkit/models/hf/qwen3/genai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""Qwen3 genai bundle support — thin shim over :mod:`winml.modelkit.utils.genai`.

All generic logic (``PipelineStage``, ``DecoderIOMapping``, ``build_genai_config``,
``build_decoder_pipeline_stages``, ``write_genai_bundle``) lives in
:mod:`winml.modelkit.utils.genai` so it can be reused by other model families.

This module re-exports that API unchanged and adds
``build_qwen3_transformer_only_stages`` as a backward-compatible alias for
``build_decoder_pipeline_stages``. New code should prefer the generic names.
"""

from __future__ import annotations

from winml.modelkit.utils.genai import (
DEFAULT_CONTEXT_FILENAME,
DEFAULT_EMBEDDINGS_FILENAME,
DEFAULT_ITERATOR_FILENAME,
DEFAULT_LM_HEAD_FILENAME,
DecoderIOMapping,
PipelineStage,
_detect_format_patterns,
build_decoder_pipeline_stages,
build_genai_config,
qnn_stage_session_options,
write_genai_bundle,
)


# Backward-compatible alias: existing callers that import
# ``build_qwen3_transformer_only_stages`` continue to work unchanged.
# Plain assignment: both names refer to the same object.
build_qwen3_transformer_only_stages = build_decoder_pipeline_stages

# Keep the internal helper accessible for tests that import it directly.
# The underscore-prefixed name is an alias of ``qnn_stage_session_options``;
# only the public name is listed in ``__all__`` below.
_qnn_stage_session_options = qnn_stage_session_options

__all__ = [
"DEFAULT_CONTEXT_FILENAME",
"DEFAULT_EMBEDDINGS_FILENAME",
"DEFAULT_ITERATOR_FILENAME",
"DEFAULT_LM_HEAD_FILENAME",
"DecoderIOMapping",
"PipelineStage",
"_detect_format_patterns",
"build_decoder_pipeline_stages",
"build_genai_config",
"build_qwen3_transformer_only_stages",
"qnn_stage_session_options",
"write_genai_bundle",
]
12 changes: 6 additions & 6 deletions src/winml/modelkit/quant/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from typing import TYPE_CHECKING, Any

from .calibration import get_quant_finalizer
from .config import QuantizeResult, WinMLQuantizationConfig


Expand All @@ -29,18 +30,17 @@
]


# Names below are loaded lazily via ``__getattr__`` to avoid pulling in
# onnxruntime.quantization/torch at import time. The TYPE_CHECKING re-imports
# give static analyzers (mypy, CodeQL) visibility into what ``__all__`` exports
# without triggering the heavy imports at runtime.
# ``quantize_onnx`` is loaded lazily via ``__getattr__`` to avoid pulling in
# onnxruntime.quantization at import time. The TYPE_CHECKING re-import gives
# static analyzers (mypy, CodeQL) visibility into what ``__all__`` exports.
# ``get_quant_finalizer`` is imported directly above — its module chain
# (calibration/__init__ -> registry) is lightweight and safe at import time.
if TYPE_CHECKING:
from .calibration import get_quant_finalizer
from .quantizer import quantize_onnx


_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
"quantize_onnx": (".quantizer", "quantize_onnx"),
"get_quant_finalizer": (".calibration", "get_quant_finalizer"),
}


Expand Down
1 change: 1 addition & 0 deletions src/winml/modelkit/quant/calibration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ def finalize(
model_id: str | None = None,
) -> WinMLQuantizationConfig:
"""Return ``quant`` populated with the graph-derived quant settings."""
...
12 changes: 12 additions & 0 deletions src/winml/modelkit/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
"""WinMLSession - ONNX Runtime session manager with WinML EP integration."""

from .ep_registry import WinMLEPRegistry
from .genai_session import (
GenaiLoadError,
GenaiNotInstalledError,
GenaiSession,
GenaiSessionError,
GenerationConfig,
)
from .monitor.ep_monitor import EPMonitor, NullEPMonitor
from .monitor.hw_monitor import HWMonitor
from .monitor.openvino_monitor import OpenVinoMonitor
Expand All @@ -17,6 +24,11 @@

__all__ = [
"EPMonitor",
"GenaiLoadError",
"GenaiNotInstalledError",
"GenaiSession",
"GenaiSessionError",
"GenerationConfig",
"HWMonitor",
"InferenceError",
"NullEPMonitor",
Expand Down
Loading