68 changes: 68 additions & 0 deletions py/model_performance_report.md
@@ -0,0 +1,68 @@
# Model Performance Test Report

## Summary

- **Total Tests**: 8
- **Passed**: 8
- **Failed**: 0
- **Success Rate**: 100.0%

## Detailed Results

### googleai/gemini-2.0-flash

#### Config: `{}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.602s
- **Response**: Aye, 5 doubloons plus 3 more... that be 8 doubloons in total!


#### Config: `{"temperature": 0.0}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.832s
- **Response**: Ahoy there, matey! 5 + 3 be equal to 8! Shiver me timbers!


#### Config: `{"temperature": 2.0}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.565s
- **Response**: Arrr, 5 doubloons plus 3 doubloons be equal to 8 doubloons!


#### Config: `{"temperature": 1.0}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.662s
- **Response**: Aye, 5 doubloons plus 3 doubloons be 8 doubloons in total! Shiver me timbers!


#### Config: `{"topP": 0.0}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.774s
- **Response**: Ahoy there, matey! 5 + 3 be equal to 8! Shiver me timbers, that be a fine haul o' doubloons!


#### Config: `{"topP": 1.0}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.651s
- **Response**: Ahoy there, matey! 5 + 3 be equal to 8! Shiver me timbers!


#### Config: `{"topP": 0.5}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.752s
- **Response**: Ahoy there, matey! 5 + 3 be equal to 8! Shiver me timbers, that be a fine bit o' arithmetic!


#### Config: `{"safetySettings": []}`

- **Status**: ✅ SUCCESS
- **Timing**: 1.562s
- **Response**: Aye, 5 doubloons plus 3 doubloons be 8 doubloons, me hearty!
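For reference, the summary block above is a straightforward aggregation of the per-config results. A minimal sketch, not part of the PR, with field names assumed to mirror this report's layout:

```python
# Minimal sketch of deriving the summary block from per-config results.
# The dict fields are assumptions based on this report's columns.
results = [
    {'config': '{}', 'success': True, 'timing': 1.602},
    {'config': '{"temperature": 0.0}', 'success': True, 'timing': 1.832},
    # ... one entry per model+config combination (8 in this run)
]

passed = sum(1 for r in results if r['success'])
total = len(results)
print(
    f'Total Tests: {total}, Passed: {passed}, Failed: {total - passed}, '
    f'Success Rate: {100.0 * passed / total:.1f}%'
)
```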

38 changes: 13 additions & 25 deletions py/samples/sample-test/README.md
@@ -1,35 +1,23 @@
-# Sample Test Utilities
+# Model Performance Testing Tool
 
-Internal testing utilities for Genkit samples. These scripts help developers
-verify that sample flows are working correctly.
+A tool to test model performance across different models and configuration variations.
 
-## Scripts
+## Setup
 
-### `review_sample_flows.py`
+1. Ensure you have `uv` installed.
+2. Set your API keys (e.g., `GOOGLE_GENAI_API_KEY`).
 
-Reviews and tests all flows in a sample's `main.py`.
+## Usage
 
-```bash
-# Test all flows in a sample
-cd py
-uv run samples/sample-test/review_sample_flows.py samples/google-genai-hello
-
-# Specify custom output file
-uv run samples/sample-test/review_sample_flows.py samples/google-genai-hello --output results.txt
-```
-
-### `run_single_flow.py`
-
-Runs a single flow from a sample. Used internally by `review_sample_flows.py`.
+Run the tool:
 
 ```bash
 cd py
-uv run samples/sample-test/run_single_flow.py samples/google-genai-hello flow_name --input '{"key": "value"}'
+uv run test_model_performance.py --models googleai/gemini-2.0-flash
 ```
 
-## Output
+## Features
 
-The review script generates a report file with:
-- Summary of successful/failed flows
-- Detailed input/output for each flow
-- Error messages and tracebacks for failures
+- **Model Discovery**: Automatically finds registered models.
+- **Config Discovery**: Inspects model schema to find parameters.
+- **Variations**: Tests min, max, midpoint, and default values.
+- **Report**: Generates a Markdown report with pass/fail stats and detailed results.
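The **Config Discovery** and **Variations** features amount to boundary-value testing over each model's config schema. A hypothetical sketch of that step, assuming a JSON-schema-like shape; the real logic lives in `test_model_performance.py`, which is not part of this diff:

```python
# Hypothetical sketch of generating config variations from a model's
# JSON-schema-like config schema. Names and schema shape are assumptions.
from typing import Any


def config_variations(schema: dict[str, Any]) -> list[dict[str, Any]]:
    """Build min/max/midpoint variations from bounded numeric schema fields."""
    variations: list[dict[str, Any]] = [{}]  # always test the default (empty) config
    for name, spec in schema.get('properties', {}).items():
        lo, hi = spec.get('minimum'), spec.get('maximum')
        if lo is None or hi is None:
            continue  # only bounded numeric parameters get boundary values
        for value in (lo, hi, (lo + hi) / 2):
            variations.append({name: value})
    return variations


# A temperature bounded to [0.0, 2.0] yields {}, {"temperature": 0.0},
# {"temperature": 2.0}, and {"temperature": 1.0}: the same configs that
# appear in model_performance_report.md above.
print(config_variations({'properties': {'temperature': {'minimum': 0.0, 'maximum': 2.0}}}))
```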
45 changes: 45 additions & 0 deletions py/samples/sample-test/pyproject.toml
@@ -0,0 +1,45 @@
# Copyright 2026 Google LLC
# SPDX-License-Identifier: Apache-2.0

[project]
name = "sample-test"
version = "0.1.0"
description = "Model Performance Testing Tool"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"genkit",
"genkit-plugin-google-genai",
"genkit-plugin-vertex-ai",
"genkit-plugin-google-cloud",
"genkit-plugin-amazon-bedrock",
"genkit-plugin-anthropic",
"genkit-plugin-deepseek",
"genkit-plugin-xai",
"genkit-plugin-ollama",
"genkit-plugin-mistral",
"genkit-plugin-evaluators",
"fastapi",
"uvicorn",
"datamodel-code-generator",
]

[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]

[tool.hatch.build.targets.wheel]
packages = ["test_model_performance.py", "run_single_model_test.py"]

[tool.uv.sources]
genkit = { path = "../../packages/genkit", editable = true }
genkit-plugin-google-genai = { path = "../../plugins/google-genai", editable = true }
genkit-plugin-vertex-ai = { path = "../../plugins/vertex-ai", editable = true }
genkit-plugin-google-cloud = { path = "../../plugins/google-cloud", editable = true }
genkit-plugin-amazon-bedrock = { path = "../../plugins/amazon-bedrock", editable = true }
genkit-plugin-anthropic = { path = "../../plugins/anthropic", editable = true }
genkit-plugin-deepseek = { path = "../../plugins/deepseek", editable = true }
genkit-plugin-xai = { path = "../../plugins/xai", editable = true }
genkit-plugin-ollama = { path = "../../plugins/ollama", editable = true }
genkit-plugin-mistral = { path = "../../plugins/mistral", editable = true }
genkit-plugin-evaluators = { path = "../../plugins/evaluators", editable = true }
163 changes: 163 additions & 0 deletions py/samples/sample-test/run_single_model_test.py
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

"""Helper script to run a single model test in isolation.

This script is called by test_model_performance.py to execute each model+config
test in a separate subprocess, avoiding state pollution and enabling timeout handling.

Usage:
    python run_single_model_test.py <model_name> --config <json_config> --user-prompt <text> --system-prompt <text>

Output:
    JSON object with 'success', 'response', 'error', 'timing' fields
"""

import argparse
import json
import time
from typing import Any


async def run_model_test(
    model_name: str,
    config: dict[str, Any],
    user_prompt: str,
    system_prompt: str | None = None,
) -> dict[str, Any]:
    """Run a single model test and return result.

    Args:
        model_name: Name of the model to test
        config: Configuration dictionary for the model
        user_prompt: User prompt to send
        system_prompt: Optional system prompt

    Returns:
        Dict with 'success', 'response', 'error', 'timing' fields
    """
    result: dict[str, Any] = {
        "success": False,
        "response": None,
        "error": None,
        "timing": 0.0,
    }

    try:
        # Import Genkit inside the try block so import errors land in the result
        from genkit import Genkit
        from genkit.plugins.google_genai import GoogleAI, VertexAI
        from genkit.core.typing import Message, TextPart

        plugins = []
        try:
            plugins.append(GoogleAI())
        except Exception:
            pass
        try:
            plugins.append(VertexAI())
        except Exception:
            pass

        # Initialize Genkit
        ai = Genkit(plugins=plugins)

        # Build the prompt
        messages = []
        if system_prompt:
            messages.append(Message(
                role='system',
                content=[TextPart(text=system_prompt)]
            ))
        messages.append(Message(
            role='user',
            content=[TextPart(text=user_prompt)]
        ))

        # Time only the generation call itself
        start_time = time.time()

        # Run generation
        response = await ai.generate(
            model=model_name,
            messages=messages,
            config=config,
        )

        # Calculate timing
        elapsed = time.time() - start_time

        # Extract response text
        response_text = response.text if hasattr(response, 'text') else str(response)

        result["success"] = True
        result["response"] = response_text
        result["timing"] = round(elapsed, 3)

    except Exception as e:
        import traceback
        result["error"] = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"

    return result


def main() -> None:
    """Run a single model test and output JSON result."""
    parser = argparse.ArgumentParser(description='Run a single model test.')
    parser.add_argument('model_name', type=str, help='Name of the model to test')
    parser.add_argument('--config', type=str, default='{}', help='JSON string of model config')
    parser.add_argument('--user-prompt', type=str, required=True, help='User prompt text')
    parser.add_argument('--system-prompt', type=str, default=None, help='System prompt text')
    args = parser.parse_args()

    # Suppress verbose logging
    import logging
    logging.basicConfig(level=logging.ERROR)
    logging.getLogger('genkit').setLevel(logging.ERROR)
    logging.getLogger('google').setLevel(logging.ERROR)

    # Override input() to prevent blocking
    import builtins
    builtins.input = lambda prompt="": "dummy_value"

    result: dict[str, Any] = {
        "success": False,
        "response": None,
        "error": "Unknown initialization error",
        "timing": 0.0,
    }

    try:
        # Parse config
        config = json.loads(args.config)

        # Run test in async context
        import asyncio
        result = asyncio.run(run_model_test(
            args.model_name,
            config,
            args.user_prompt,
            args.system_prompt,
        ))

    except Exception as e:
        result = {
            "success": False,
            "response": None,
            "error": f"Initialization failed: {e}",
            "timing": 0.0,
        }

    # Output JSON result with markers
    print("---JSON_RESULT_START---")
    print(json.dumps(result))
    print("---JSON_RESULT_END---")


if __name__ == "__main__":
    main()
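For context, a hedged sketch of how the parent process might invoke this helper and recover its marker-delimited payload. `test_model_performance.py` is not shown in this diff, so the exact invocation, timeout value, and prompts (inferred from the report above) are assumptions:

```python
# Hypothetical driver-side sketch: run the helper in a subprocess with a
# timeout and parse the JSON between the output markers. subprocess.run()
# raises TimeoutExpired if the model call hangs past the deadline.
import json
import subprocess

cmd = [
    'uv', 'run', 'run_single_model_test.py',
    'googleai/gemini-2.0-flash',
    '--config', '{"temperature": 0.0}',
    '--user-prompt', 'What is 5 + 3?',
    '--system-prompt', 'Talk like a pirate.',
]
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

# Slice out the payload so plugin log noise on stdout cannot break parsing.
stdout = proc.stdout
start = stdout.index('---JSON_RESULT_START---') + len('---JSON_RESULT_START---')
end = stdout.index('---JSON_RESULT_END---')
result = json.loads(stdout[start:end].strip())
print(result['success'], result['timing'])
```

Isolating each model+config pair in its own subprocess means a hung or crashing provider SDK cannot stall the rest of the sweep, and the markers keep the JSON recoverable even when plugins write their own logs to stdout.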