From b0fa443173fbb81dcc4395d6dfcd349cfa71c5f6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 30 Jun 2026 12:22:22 +0800 Subject: [PATCH] fix(e2e): skip --precision fp16 tests on MIGraphX EP (compile hangs) MIGraphX cannot compile FP16 models: compilation hangs until the test times out. Add a require_not_ep("migraphx") guard to the tests that explicitly pass --precision fp16 and trigger model compilation. Most of this diff is formatter reflow (one argument per line in _invoke calls, reflowed signatures and asserts). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e/test_eval_e2e.py | 871 +++++++++++++++++++++++++------------ 1 file changed, 590 insertions(+), 281 deletions(-) diff --git a/tests/e2e/test_eval_e2e.py b/tests/e2e/test_eval_e2e.py index b5edcdefd..a2d1f8580 100644 --- a/tests/e2e/test_eval_e2e.py +++ b/tests/e2e/test_eval_e2e.py @@ -71,7 +71,7 @@ def tiny_textcls_script(tmp_path: Path) -> Path: """ script = tmp_path / "build_tiny_textcls.py" script.write_text( - '''import argparse + """import argparse from datasets import Dataset ROWS = [ @@ -91,7 +91,7 @@ def tiny_textcls_script(tmp_path: Path) -> Path: p.add_argument("--output", required=True) args = p.parse_args() Dataset.from_list(ROWS).save_to_disk(args.output) -''', +""", encoding="utf-8", ) return script @@ -132,7 +132,10 @@ def _assert_metrics_present(output_path: Path, required_keys: list[str]) -> dict def _assert_in_range( - metrics: dict, key: str, lo: float, hi: float, + metrics: dict, + key: str, + lo: float, + hi: float, ) -> None: """Assert ``metrics[key]`` is a finite number within ``[lo, hi]``. 
@@ -146,9 +149,7 @@ def _assert_in_range( f"metric {key} not numeric: {value!r} ({type(value).__name__})" ) assert math.isfinite(value), f"metric {key} is not finite: {value}" - assert lo <= value <= hi, ( - f"metric {key}={value} outside expected range [{lo}, {hi}]" - ) + assert lo <= value <= hi, f"metric {key}={value} outside expected range [{lo}, {hi}]" # =========================================================================== @@ -166,13 +167,20 @@ def test_image_classification(self, runner: CliRunner, tmp_path: Path) -> None: # HF evaluate.evaluator("image-classification") returns `accuracy`. # --streaming avoids caching full mini-imagenet (~1-2 GB). out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "google/vit-base-patch16-224", - "--task", "image-classification", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "google/vit-base-patch16-224", + "--task", + "image-classification", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy"]) # ViT-base full ImageNet ≈ 0.81; floor at 0.5 still catches # broken-pipeline regressions on 10 samples. @@ -184,12 +192,19 @@ def test_text_classification(self, runner: CliRunner, tmp_path: Path) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy"]) # bert-mrpc full MRPC ≈ 0.86; MRPC majority baseline ≈ 0.68. 
# Magnitude assertion is QNN-only: VitisAI W8A8 quantization @@ -201,12 +216,19 @@ def test_token_classification(self, runner: CliRunner, tmp_path: Path) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "dslim/bert-base-NER", - "--task", "token-classification", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "dslim/bert-base-NER", + "--task", + "token-classification", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present( out, ["overall_precision", "overall_recall", "overall_f1", "overall_accuracy"], @@ -220,13 +242,20 @@ def test_object_detection(self, runner: CliRunner, tmp_path: Path) -> None: # COCO val is ~6 GB; --streaming keeps only the bytes needed # for the sampled subset. out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "hustvl/yolos-small", - "--task", "object-detection", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "hustvl/yolos-small", + "--task", + "object-detection", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["map", "map_50", "mar_100"]) # COCO mAP / mAR are bounded by [0, 1]; torchmetrics may report -1 # when no positives are sampled, which is acceptable for tiny N. 
@@ -238,27 +267,43 @@ def test_image_segmentation(self, runner: CliRunner, tmp_path: Path) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "nvidia/segformer-b1-finetuned-ade-512-512", - "--task", "image-segmentation", - "--dataset", "danjacobellis/scene_parse_150", - "--split", "validation", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "nvidia/segformer-b1-finetuned-ade-512-512", + "--task", + "image-segmentation", + "--dataset", + "danjacobellis/scene_parse_150", + "--split", + "validation", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["mean_iou"]) _assert_in_range(data["metrics"], "mean_iou", 0.0, 1.0) def test_question_answering(self, runner: CliRunner, tmp_path: Path) -> None: require_ep("qnn") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "distilbert/distilbert-base-cased-distilled-squad", - "--task", "question-answering", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "distilbert/distilbert-base-cased-distilled-squad", + "--task", + "question-answering", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["exact_match", "f1"]) # distilbert-squad full SQuAD v1: EM ≈ 77, F1 ≈ 85 (percentages). # Both are harsh on N=10 (heavy per-sample variance with seed=42). 
@@ -268,12 +313,19 @@ def test_question_answering(self, runner: CliRunner, tmp_path: Path) -> None: def test_feature_extraction(self, runner: CliRunner, tmp_path: Path) -> None: out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "sentence-transformers/all-MiniLM-L6-v2", - "--task", "feature-extraction", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "sentence-transformers/all-MiniLM-L6-v2", + "--task", + "feature-extraction", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) # Spearman correlation reported as percentage in [-100, 100]. # MiniLM-L6-v2 full STSB ≈ 80; 10-sample noise can be large. # Magnitude assertion is QNN-only: VitisAI W8A8 quantization @@ -285,19 +337,28 @@ def test_feature_extraction(self, runner: CliRunner, tmp_path: Path) -> None: def test_sentence_similarity(self, runner: CliRunner, tmp_path: Path) -> None: # Alias for feature-extraction. out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "sentence-transformers/all-MiniLM-L6-v2", - "--task", "sentence-similarity", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "sentence-transformers/all-MiniLM-L6-v2", + "--task", + "sentence-similarity", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) # Same quantization caveat as test_feature_extraction. data = _assert_metrics_present(out, ["cosine_spearman"]) if is_host("qnn"): _assert_in_range(data["metrics"], "cosine_spearman", 40.0, 100.0) def test_image_feature_extraction( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # kNN accuracies reported as percentages 0..100. # --streaming avoids caching mini-imagenet. @@ -307,15 +368,23 @@ def test_image_feature_extraction( # modality-aware task vocabulary, so it is not a valid task for a vision # model (it would resolve to the text evaluator/dataset and fail). 
out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "facebook/dinov2-small", - "--task", "image-feature-extraction", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "facebook/dinov2-small", + "--task", + "image-feature-extraction", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present( - out, ["knn_top1_accuracy", "knn_top5_accuracy"], + out, + ["knn_top1_accuracy", "knn_top5_accuracy"], ) # Smoke-only: at --samples 10 over mini-imagenet's 100 classes, # leave-one-out kNN is statistical noise (even unquantized fp32 @@ -331,18 +400,30 @@ def test_image_to_text_fp16(self, runner: CliRunner, tmp_path: Path) -> None: # Only test that exercises non-auto --precision. # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") + require_not_ep("migraphx") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Salesforce/blip-image-captioning-base", - "--task", "image-to-text", - "--dataset", "lmms-lab/flickr30k", - "--split", "test", - "--streaming", - "--samples", SAMPLES, - "--precision", "fp16", - "--column", "label_column=caption", - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Salesforce/blip-image-captioning-base", + "--task", + "image-to-text", + "--dataset", + "lmms-lab/flickr30k", + "--split", + "test", + "--streaming", + "--samples", + SAMPLES, + "--precision", + "fp16", + "--column", + "label_column=caption", + "-o", + str(out), + ], + ) # CLI contract: exit 0 and produce the metric keys. Tiny N may # yield None values; magnitude is checked in the accuracy regression # suite, not here. 
@@ -351,21 +432,26 @@ def test_image_to_text_fp16(self, runner: CliRunner, tmp_path: Path) -> None: for k, hi in (("cer", 10.0), ("cider", 20.0)): v = m[k] assert v is None or ( - isinstance(v, (int, float)) - and math.isfinite(v) - and 0.0 <= v <= hi + isinstance(v, (int, float)) and math.isfinite(v) and 0.0 <= v <= hi ), f"metric {k}={v!r} not None or in [0,{hi}]" assert isinstance(m["n_samples"], int) and m["n_samples"] >= 0 def test_fill_mask(self, runner: CliRunner, tmp_path: Path) -> None: # Pseudo-perplexity >= 1 (perplexity is exp of non-neg NLL). out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "distilbert/distilbert-base-uncased", - "--task", "fill-mask", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "distilbert/distilbert-base-uncased", + "--task", + "fill-mask", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["pseudo_perplexity", "nll"]) # Pseudo-perplexity over a 10-sample wikitext stream can vary # widely (we observed ~3000 with seed=42). Cap is set well above @@ -374,17 +460,26 @@ def test_fill_mask(self, runner: CliRunner, tmp_path: Path) -> None: _assert_in_range(data["metrics"], "nll", 0.0, 15.0) def test_zero_shot_classification( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: require_ep("qnn") # Zero-shot uses ClassificationMetric → accuracy + f1. out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "cross-encoder/nli-deberta-v3-small", - "--task", "zero-shot-classification", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "cross-encoder/nli-deberta-v3-small", + "--task", + "zero-shot-classification", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy", "f1"]) # nli-deberta-v3-small zero-shot on AG News, N=10. 
4-class random # baseline = 0.25; tiny-N variance can push real models below @@ -393,17 +488,26 @@ def test_zero_shot_classification( _assert_in_range(data["metrics"], "f1", 0.1, 1.0) def test_zero_shot_image_classification( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "openai/clip-vit-base-patch32", - "--task", "zero-shot-image-classification", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "openai/clip-vit-base-patch32", + "--task", + "zero-shot-image-classification", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["top1_accuracy", "top5_accuracy"]) # CLIP-ViT-B/32 zero-shot on CIFAR-100: top1 ≈ 0.63, top5 ≈ 0.88 # (full set). Floors leave headroom for tiny-N variance. @@ -420,15 +524,26 @@ class TestEvalModelInputForms: """Coverage for the two non-default ``-m`` forms.""" def test_onnx_file_mode_monolithic( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: hf_id = "google/vit-base-patch16-224" task = "image-classification" # Warm cache via HF id (use streaming to avoid mini-imagenet cache). 
- _invoke(runner, [ - "-m", hf_id, "--task", task, "--streaming", "--samples", SAMPLES, - ]) + _invoke( + runner, + [ + "-m", + hf_id, + "--task", + task, + "--streaming", + "--samples", + SAMPLES, + ], + ) cache_dir = find_cache_dir(hf_id, task=task) assert cache_dir is not None, "expected cache after warm run" @@ -436,19 +551,29 @@ def test_onnx_file_mode_monolithic( assert onnx_files, f"no *_model.onnx in {cache_dir}" out = tmp_path / "result.json" - _invoke(runner, [ - "-m", str(onnx_files[0]), - "--model-id", hf_id, - "--task", task, - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + str(onnx_files[0]), + "--model-id", + hf_id, + "--task", + task, + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy"]) _assert_in_range(data["metrics"], "accuracy", 0.5, 1.0) def test_onnx_file_mode_split_encoder( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") @@ -475,14 +600,23 @@ def _pick_onnx(prefix: str) -> Path: text_onnx = _pick_onnx("feat") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", f"image-encoder={image_onnx}", - "-m", f"text-encoder={text_onnx}", - "--model-id", hf_id, - "--task", task, - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + f"image-encoder={image_onnx}", + "-m", + f"text-encoder={text_onnx}", + "--model-id", + hf_id, + "--task", + task, + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["top1_accuracy"]) _assert_in_range(data["metrics"], "top1_accuracy", 30.0, 100.0) @@ -496,17 +630,26 @@ class TestEvalOutput: """``-o`` path creation + JSON validity.""" def test_creates_nested_output_dir( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: 
# Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "nested" / "subdir" / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) assert out.exists(), "nested output dir not auto-created" data = json.loads(out.read_text()) assert "metrics" in data @@ -554,33 +697,52 @@ def test_device_cpu(self, runner: CliRunner, tmp_path: Path) -> None: # classifier well-suited to a CPU smoke test (no per-token forward # passes like fill-mask). out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "microsoft/resnet-50", - "--task", "image-classification", - "--device", "cpu", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "microsoft/resnet-50", + "--task", + "image-classification", + "--device", + "cpu", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy"]) # ResNet-50 full ImageNet ≈ 0.76; mini-imagenet is shifted, floor 0.4. _assert_in_range(data["metrics"], "accuracy", 0.4, 1.0) def test_device_npu_and_ep_qnn( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Combined --device + --ep. 
require_ep("qnn") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "google/vit-base-patch16-224", - "--task", "image-classification", - "--device", "npu", - "--ep", "qnn", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "google/vit-base-patch16-224", + "--task", + "image-classification", + "--device", + "npu", + "--ep", + "qnn", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["accuracy"]) _assert_in_range(data["metrics"], "accuracy", 0.5, 1.0) @@ -592,28 +754,43 @@ def test_device_npu_and_ep_qnn( class TestEvalAdditionalOptions: def test_dataset_name_explicit( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--dataset", "nyu-mll/glue", - "--dataset-name", "mrpc", - "--column", "input_column=sentence1", - "--column", "second_input_column=sentence2", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--dataset", + "nyu-mll/glue", + "--dataset-name", + "mrpc", + "--column", + "input_column=sentence1", + "--column", + "second_input_column=sentence2", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) # Same quantization caveat as TestEvalPerTask.test_text_classification. 
data = _assert_metrics_present(out, ["accuracy"]) if is_host("qnn"): _assert_in_range(data["metrics"], "accuracy", 0.6, 1.0) def test_label_mapping_image_segmentation( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") @@ -624,36 +801,58 @@ def test_label_mapping_image_segmentation( pytest.skip(f"label-mapping file not in repo: {label_map}") out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "nvidia/segformer-b1-finetuned-ade-512-512", - "--task", "image-segmentation", - "--dataset", "danjacobellis/scene_parse_150", - "--split", "validation", - "--streaming", - "--label-mapping", str(label_map), - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "nvidia/segformer-b1-finetuned-ade-512-512", + "--task", + "image-segmentation", + "--dataset", + "danjacobellis/scene_parse_150", + "--split", + "validation", + "--streaming", + "--label-mapping", + str(label_map), + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) data = _assert_metrics_present(out, ["mean_iou"]) _assert_in_range(data["metrics"], "mean_iou", 0.0, 1.0) def test_config_file_basic( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") # `eval` section provides task + samples. 
cfg = tmp_path / "cfg.json" - cfg.write_text(json.dumps({ - "loader": {"task": "text-classification"}, - "eval": {"dataset": {"samples": 5}}, - })) + cfg.write_text( + json.dumps( + { + "loader": {"task": "text-classification"}, + "eval": {"dataset": {"samples": 5}}, + } + ) + ) out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--config", str(cfg), - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--config", + str(cfg), + "-o", + str(out), + ], + ) # Same quantization caveat as TestEvalPerTask.test_text_classification. data = _assert_metrics_present(out, ["accuracy"]) if is_host("qnn"): @@ -663,23 +862,36 @@ def test_config_file_basic( ) def test_config_file_cli_override( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") # CLI wins over config file. cfg = tmp_path / "cfg.json" - cfg.write_text(json.dumps({ - "loader": {"task": "text-classification"}, - "eval": {"dataset": {"samples": 5}}, - })) + cfg.write_text( + json.dumps( + { + "loader": {"task": "text-classification"}, + "eval": {"dataset": {"samples": 5}}, + } + ) + ) out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--config", str(cfg), - "--samples", "7", - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--config", + str(cfg), + "--samples", + "7", + "-o", + str(out), + ], + ) # Same quantization caveat as TestEvalPerTask.test_text_classification. 
data = _assert_metrics_present(out, ["accuracy"]) if is_host("qnn"): @@ -689,17 +901,25 @@ def test_config_file_cli_override( ) def test_auto_task_detection( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") # No --task flag; CLI infers from HF model. out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) # Same quantization caveat as TestEvalPerTask.test_text_classification. data = _assert_metrics_present(out, ["accuracy"]) if is_host("qnn"): @@ -709,7 +929,10 @@ def test_auto_task_detection( ) def test_precision_warning_for_prebuilt_onnx( - self, runner: CliRunner, tmp_path: Path, caplog, + self, + runner: CliRunner, + tmp_path: Path, + caplog, ) -> None: # Pre-built ONNX + --precision emits warning, still succeeds. import logging as _logging @@ -726,63 +949,100 @@ def test_precision_warning_for_prebuilt_onnx( out = tmp_path / "result.json" with caplog.at_level(_logging.WARNING, logger="winml.modelkit.commands.eval"): - _invoke(runner, [ - "-m", str(onnx_files[0]), - "--model-id", hf_id, - "--task", task, - "--precision", "fp16", - "--streaming", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + str(onnx_files[0]), + "--model-id", + hf_id, + "--task", + task, + "--precision", + "fp16", + "--streaming", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) # Warning is emitted via ``logger.warning(...)``; capture from log records. 
msgs = [r.getMessage().lower() for r in caplog.records] - assert any( - "precision" in m and ("ignor" in m or "pre-built" in m) - for m in msgs - ), f"expected precision-ignored warning, got:\n{msgs!r}" + assert any("precision" in m and ("ignor" in m or "pre-built" in m) for m in msgs), ( + f"expected precision-ignored warning, got:\n{msgs!r}" + ) _assert_metrics_present(out, ["accuracy"]) def test_dataset_script_with_column_remap( - self, runner: CliRunner, tmp_path: Path, tiny_textcls_script: Path, + self, + runner: CliRunner, + tmp_path: Path, + tiny_textcls_script: Path, ) -> None: # Skip e2e for VitisAI due to Windows Access violation in model compilation for some models require_not_ep("vitisai") # --dataset-script + --column + --trust-remote-code (happy path). ds_path = tmp_path / "tiny_textcls" out = tmp_path / "result.json" - _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--dataset-script", str(tiny_textcls_script), - "--dataset", str(ds_path), - "--trust-remote-code", - "--column", "input_column=text_a", - "--column", "second_input_column=text_b", - "--samples", "10", - "-o", str(out), - ]) + _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--dataset-script", + str(tiny_textcls_script), + "--dataset", + str(ds_path), + "--trust-remote-code", + "--column", + "input_column=text_a", + "--column", + "second_input_column=text_b", + "--samples", + "10", + "-o", + str(out), + ], + ) assert ds_path.exists(), "dataset script did not write to --dataset path" data = _assert_metrics_present(out, ["accuracy"]) _assert_in_range(data["metrics"], "accuracy", 0.0, 1.0) def test_dataset_script_without_trust_remote_code( - self, runner: CliRunner, tmp_path: Path, tiny_textcls_script: Path, + self, + runner: CliRunner, + tmp_path: Path, + tiny_textcls_script: Path, ) -> None: ds_path = tmp_path / "tiny_textcls" - result = _invoke(runner, [ - "-m", 
"Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--dataset-script", str(tiny_textcls_script), - "--dataset", str(ds_path), - "--samples", "10", - ], expect_success=False) + result = _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--dataset-script", + str(tiny_textcls_script), + "--dataset", + str(ds_path), + "--samples", + "10", + ], + expect_success=False, + ) assert result.exit_code != 0 assert "trust-remote-code" in result.output.lower(), result.output def test_compare_mode_image_classification( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: + require_not_ep("migraphx") # --mode compare runs the ONNX candidate and the HF PyTorch reference # on the same random inputs and reports per-output tensor-parity # metrics in display-ready flat shape: @@ -790,14 +1050,23 @@ def test_compare_mode_image_classification( # over 5 metrics (sqnr_db, psnr_db, cosine_similarity, mse, # max_abs_diff) x 4 stats (mean, std, min, max) = 20 top-level keys. 
out = tmp_path / "result.json" - _invoke(runner, [ - "--mode", "compare", - "-m", "microsoft/resnet-50", - "--task", "image-classification", - "--precision", "fp16", - "--samples", SAMPLES, - "-o", str(out), - ]) + _invoke( + runner, + [ + "--mode", + "compare", + "-m", + "microsoft/resnet-50", + "--task", + "image-classification", + "--precision", + "fp16", + "--samples", + SAMPLES, + "-o", + str(out), + ], + ) assert out.exists(), f"output file not created: {out}" data = json.loads(out.read_text()) metrics = data.get("metrics", {}) @@ -816,9 +1085,7 @@ def test_compare_mode_image_classification( per_output_names: set[str] | None = None for key in expected_keys: row = metrics[key] - assert isinstance(row, dict) and row, ( - f"metrics[{key!r}] not a non-empty dict: {row!r}" - ) + assert isinstance(row, dict) and row, f"metrics[{key!r}] not a non-empty dict: {row!r}" assert all(isinstance(v, (int, float)) for v in row.values()), ( f"non-numeric value in metrics[{key!r}]: {row!r}" ) @@ -847,8 +1114,7 @@ def test_compare_mode_image_classification( threshold = 0.95 if is_host("qnn") else 0.5 for output_name, value in cos_mean.items(): assert value >= threshold, ( - f"cosine_similarity_mean[{output_name}]={value} " - f"below {threshold} sanity floor" + f"cosine_similarity_mean[{output_name}]={value} below {threshold} sanity floor" ) @@ -859,45 +1125,76 @@ def test_compare_mode_image_classification( class TestEvalErrorPaths: def test_bad_column_format( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: - result = _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--column", "foo", # missing '=' - "--samples", "1", - ], expect_success=False) + result = _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--column", + "foo", # missing '=' + "--samples", + "1", + ], + expect_success=False, + ) assert result.exit_code != 0 assert 
"key=value" in result.output.lower() or "invalid" in result.output.lower(), ( result.output ) def test_missing_label_mapping_file( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: missing = tmp_path / "does-not-exist.json" - result = _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--label-mapping", str(missing), - "--samples", "1", - ], expect_success=False) + result = _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--label-mapping", + str(missing), + "--samples", + "1", + ], + expect_success=False, + ) assert result.exit_code != 0 out_lower = result.output.lower() - assert ("does not exist" in out_lower - or "not found" in out_lower - or "no such file" in out_lower), result.output + assert ( + "does not exist" in out_lower or "not found" in out_lower or "no such file" in out_lower + ), result.output def test_bogus_dataset_name( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: - result = _invoke(runner, [ - "-m", "Intel/bert-base-uncased-mrpc", - "--task", "text-classification", - "--dataset", "nyu-mll/glue", - "--dataset-name", "not_a_real_glue_config", - "--samples", "1", - ], expect_success=False) + result = _invoke( + runner, + [ + "-m", + "Intel/bert-base-uncased-mrpc", + "--task", + "text-classification", + "--dataset", + "nyu-mll/glue", + "--dataset-name", + "not_a_real_glue_config", + "--samples", + "1", + ], + expect_success=False, + ) assert result.exit_code != 0 # Loose: exact wording depends on datasets lib version assert "config" in result.output.lower() or "not_a_real_glue_config" in result.output, ( @@ -913,18 +1210,23 @@ def test_schema_without_task(self, runner: CliRunner) -> None: def test_schema_bogus_task(self, runner: CliRunner) -> None: # get_evaluator_class ValueError wrapped as UsageError. 
result = _invoke( - runner, ["--schema", "--task", "not-a-real-task"], + runner, + ["--schema", "--task", "not-a-real-task"], expect_success=False, ) assert result.exit_code != 0 out_lower = result.output.lower() - assert ("not-a-real-task" in out_lower - or "unknown" in out_lower - or "unsupported" in out_lower - or "invalid" in out_lower), result.output + assert ( + "not-a-real-task" in out_lower + or "unknown" in out_lower + or "unsupported" in out_lower + or "invalid" in out_lower + ), result.output def test_onnx_file_without_model_id( - self, runner: CliRunner, tmp_path: Path, + self, + runner: CliRunner, + tmp_path: Path, ) -> None: # Needs a real .onnx file path that exists; reuse warmed cache. hf_id = "google/vit-base-patch16-224" @@ -935,10 +1237,17 @@ def test_onnx_file_without_model_id( onnx_files = list(cache_dir.glob("*_model.onnx")) assert onnx_files - result = _invoke(runner, [ - "-m", str(onnx_files[0]), - "--task", task, - "--samples", "1", - ], expect_success=False) + result = _invoke( + runner, + [ + "-m", + str(onnx_files[0]), + "--task", + task, + "--samples", + "1", + ], + expect_success=False, + ) assert result.exit_code != 0 assert "model-id" in result.output.lower(), result.output