From 2c09eeffb29f2eaef36044f9d3df501b553afcf7 Mon Sep 17 00:00:00 2001
From: jsdevninja
Date: Tue, 12 May 2026 08:07:10 -0500
Subject: [PATCH] feat: add model-compatibility.json registry and models CLI

---
 model-compatibility.json                     |  58 ++++++
 pyproject.toml                               |   3 +
 src/browser_harness/model-compatibility.json |  58 ++++++
 src/browser_harness/model_compatibility.py   | 199 +++++++++++++++++++
 src/browser_harness/run.py                   |  14 ++
 tests/unit/test_model_compatibility.py       | 116 +++++++++++
 6 files changed, 448 insertions(+)
 create mode 100644 model-compatibility.json
 create mode 100644 src/browser_harness/model-compatibility.json
 create mode 100644 src/browser_harness/model_compatibility.py
 create mode 100644 tests/unit/test_model_compatibility.py

diff --git a/model-compatibility.json b/model-compatibility.json
new file mode 100644
index 00000000..d1766699
--- /dev/null
+++ b/model-compatibility.json
@@ -0,0 +1,58 @@
+[
+  {
+    "model": "qwen3.6:35b-a3b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 35,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen3:8b",
+    "provider": "ollama",
+    "status": "works",
+    "parameter_size_b": 8,
+    "notes": "Usable for very short scripts; frequent tool-call mistakes on longer flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen2.5:32b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 32,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "mistral-small",
+    "provider": "anyscale",
+    "status": "verified",
+    "parameter_size_b": 22,
+    "notes": "Good balance of speed and capability for headless deployments.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "llama3.1:70b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 70,
+    "notes": "Strong reliability on multi-step automation when VRAM permits.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "phi4",
+    "provider": "ollama",
+    "status": "unknown",
+    "parameter_size_b": 14,
+    "notes": "Community reports only; not yet re-verified on current harness.",
+    "last_tested": "2026-04-15"
+  },
+  {
+    "model": "tinyllama",
+    "provider": "ollama",
+    "status": "broken",
+    "parameter_size_b": 1,
+    "notes": "Too small for reliable tool use; not recommended for harness workflows.",
+    "last_tested": "2026-05-01"
+  }
+]
diff --git a/pyproject.toml b/pyproject.toml
index f812a6ab..b11a344c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,5 +23,8 @@ package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
 where = ["src"]
 
+[tool.setuptools.package-data]
+browser_harness = ["model-compatibility.json"]
+
 [tool.pytest.ini_options]
 pythonpath = ["src"]
diff --git a/src/browser_harness/model-compatibility.json b/src/browser_harness/model-compatibility.json
new file mode 100644
index 00000000..d1766699
--- /dev/null
+++ b/src/browser_harness/model-compatibility.json
@@ -0,0 +1,58 @@
+[
+  {
+    "model": "qwen3.6:35b-a3b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 35,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen3:8b",
+    "provider": "ollama",
+    "status": "works",
+    "parameter_size_b": 8,
+    "notes": "Usable for very short scripts; frequent tool-call mistakes on longer flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen2.5:32b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 32,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "mistral-small",
+    "provider": "anyscale",
+    "status": "verified",
+    "parameter_size_b": 22,
+    "notes": "Good balance of speed and capability for headless deployments.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "llama3.1:70b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 70,
+    "notes": "Strong reliability on multi-step automation when VRAM permits.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "phi4",
+    "provider": "ollama",
+    "status": "unknown",
+    "parameter_size_b": 14,
+    "notes": "Community reports only; not yet re-verified on current harness.",
+    "last_tested": "2026-04-15"
+  },
+  {
+    "model": "tinyllama",
+    "provider": "ollama",
+    "status": "broken",
+    "parameter_size_b": 1,
+    "notes": "Too small for reliable tool use; not recommended for harness workflows.",
+    "last_tested": "2026-05-01"
+  }
+]
diff --git a/src/browser_harness/model_compatibility.py b/src/browser_harness/model_compatibility.py
new file mode 100644
index 00000000..b763ea91
--- /dev/null
+++ b/src/browser_harness/model_compatibility.py
@@ -0,0 +1,199 @@
+"""Load and query the model-compatibility.json registry (issue #329)."""
+
+from __future__ import annotations
+
+import json
+import re
+import sys
+from importlib import resources
+from pathlib import Path
+from typing import Any
+
+VALID_STATUSES = frozenset({"verified", "works", "unknown", "broken"})
+
+_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)\s*([bB])\b")
+
+
+def _registry_bytes() -> bytes:
+    """Load registry JSON: repo root when developing from a src/ checkout (#329), else bundled copy."""
+    here = Path(__file__).resolve().parent
+    if here.name == "browser_harness" and here.parent.name == "src":
+        # here == <repo>/src/browser_harness, so the repo root is parents[1].
+        root = here.parents[1] / "model-compatibility.json"
+        if root.is_file():
+            return root.read_bytes()
+    bundled = here / "model-compatibility.json"
+    if bundled.is_file():
+        return bundled.read_bytes()
+    return resources.files(__package__).joinpath("model-compatibility.json").read_bytes()
+
+
+def load_registry() -> list[dict[str, Any]]:
+    raw = json.loads(_registry_bytes().decode("utf-8"))
+    if not isinstance(raw, list):
+        raise ValueError("registry must be a JSON array")
+    out: list[dict[str, Any]] = []
+    for i, row in enumerate(raw):
+        if not isinstance(row, dict):
+            raise ValueError(f"entry {i} must be an object")
+        for key in ("model", "provider", "status", "notes", "last_tested"):
+            if key not in row:
+                raise ValueError(f"entry {i} missing required field {key!r}")
+            if not isinstance(row[key], str):
+                raise ValueError(f"entry {i} field {key!r} must be a string")
+        if row["status"] not in VALID_STATUSES:
+            raise ValueError(
+                f"entry {i} invalid status {row['status']!r}; "
+                f"expected one of {sorted(VALID_STATUSES)}"
+            )
+        ps = row.get("parameter_size_b")
+        if ps is not None and not isinstance(ps, (int, float)):
+            raise ValueError(f"entry {i} parameter_size_b must be a number or omitted")
+        out.append(row)
+    return out
+
+
+def parse_size_b(token: str) -> float:
+    """Parse values like '35b', '8.9B', '70' (treated as billions)."""
+    t = token.strip().lower()
+    if not t:
+        raise ValueError("empty size")
+    m = _SIZE_RE.search(t)
+    if m:
+        return float(m.group(1))
+    if t.endswith("b") and t[:-1].replace(".", "", 1).isdigit():
+        return float(t[:-1])
+    if re.fullmatch(r"\d+(?:\.\d+)?", t):
+        return float(t)
+    raise ValueError(f"unrecognized size token: {token!r}")
+
+
+def infer_parameter_size_b(entry: dict[str, Any]) -> float | None:
+    ps = entry.get("parameter_size_b")
+    if isinstance(ps, (int, float)):
+        return float(ps)
+    name = entry.get("model", "")
+    if not isinstance(name, str):
+        return None
+    m = _SIZE_RE.search(name)
+    if m:
+        return float(m.group(1))
+    return None
+
+
+def _format_row(entry: dict[str, Any]) -> str:
+    sz = infer_parameter_size_b(entry)
+    sz_s = f"{sz:g}B" if sz is not None else "—"
+    return f"{entry['model']}\t{entry['provider']}\t{entry['status']}\t{sz_s}\t{entry['last_tested']}"
+
+
+def models_list(subargs: list[str]) -> int:
+    if not subargs or subargs[0] != "list":
+        print("usage: browser-harness models list [--min-size <n>b] [--status <status>]", file=sys.stderr)
+        return 2
+
+    i = 1
+    min_b: float | None = None
+    status_filter: str | None = None
+    while i < len(subargs):
+        a = subargs[i]
+        if a == "--min-size":
+            if i + 1 >= len(subargs):
+                print("error: --min-size requires a value (e.g. 35b)", file=sys.stderr)
+                return 2
+            try:
+                min_b = parse_size_b(subargs[i + 1])
+            except ValueError as e:
+                print(f"error: {e}", file=sys.stderr)
+                return 2
+            i += 2
+            continue
+        if a == "--status":
+            if i + 1 >= len(subargs):
+                print("error: --status requires a value", file=sys.stderr)
+                return 2
+            status_filter = subargs[i + 1].strip().lower()
+            if status_filter not in VALID_STATUSES:
+                print(
+                    f"error: unknown status {subargs[i + 1]!r}; "
+                    f"expected one of {sorted(VALID_STATUSES)}",
+                    file=sys.stderr,
+                )
+                return 2
+            i += 2
+            continue
+        print(f"error: unexpected argument {a!r}", file=sys.stderr)
+        return 2
+
+    try:
+        rows = load_registry()
+    except (OSError, json.JSONDecodeError, ValueError) as e:
+        print(f"error: failed to load model registry: {e}", file=sys.stderr)
+        return 1
+
+    filtered: list[dict[str, Any]] = []
+    for row in rows:
+        if status_filter is not None and row["status"] != status_filter:
+            continue
+        if min_b is not None:
+            sz = infer_parameter_size_b(row)
+            if sz is None or sz < min_b:
+                continue
+        filtered.append(row)
+
+    print("model\tprovider\tstatus\tsize\tlast_tested")
+    for row in filtered:
+        print(_format_row(row))
+    return 0
+
+
+def resolve_model(query: str, rows: list[dict[str, Any]]) -> dict[str, Any] | list[dict[str, Any]]:
+    q = query.strip().lower()
+    if not q:
+        return []
+    exact = [r for r in rows if r["model"].lower() == q]
+    if len(exact) == 1:
+        return exact[0]
+    if len(exact) > 1:
+        return exact
+    substr = [r for r in rows if q in r["model"].lower()]
+    if len(substr) == 1:
+        return substr[0]
+    return substr
+
+
+def print_model_info(name: str) -> int:
+    try:
+        rows = load_registry()
+    except (OSError, json.JSONDecodeError, ValueError) as e:
+        print(f"error: failed to load model registry: {e}", file=sys.stderr)
+        return 1
+
+    hit = resolve_model(name, rows)
+    if isinstance(hit, list) and not hit:
+        print(f"no model matched {name!r}", file=sys.stderr)
+        return 1
+    if isinstance(hit, list):
+        names = ", ".join(repr(r["model"]) for r in hit)
+        print(f"ambiguous query {name!r}; matches: {names}", file=sys.stderr)
+        return 1
+
+    entry = hit
+    sz = infer_parameter_size_b(entry)
+    print(f"model: {entry['model']}")
+    print(f"provider: {entry['provider']}")
+    print(f"status: {entry['status']}")
+    print(f"last_tested: {entry['last_tested']}")
+    if sz is not None:
+        print(f"parameter_size: {sz:g}B")
+    print(f"notes:\n{entry['notes']}")
+    return 0
+
+
+def models_main(argv: list[str]) -> int:
+    if not argv:
+        print("usage: browser-harness models list [options]", file=sys.stderr)
+        return 2
+    if argv[0] == "list":
+        return models_list(argv)
+    print("usage: browser-harness models list [--min-size <n>b] [--status <status>]", file=sys.stderr)
+    return 2
diff --git a/src/browser_harness/run.py b/src/browser_harness/run.py
index f41e180e..afe0acfd 100644
--- a/src/browser_harness/run.py
+++ b/src/browser_harness/run.py
@@ -41,6 +41,9 @@
 browser-harness --doctor       diagnose install, daemon, and browser state
 browser-harness --update [-y]  pull the latest version (agents: pass -y)
 browser-harness --reload       stop the daemon so next call picks up code changes
+browser-harness --model-info <model>  print registry details for a model name
+browser-harness models list [--min-size <n>b] [--status <status>]
+                               list tested self-host / third-party models (see model-compatibility.json)
 """
 
 USAGE = """Usage:
@@ -88,6 +91,17 @@ def main():
         restart_daemon()
         print("daemon stopped — will restart fresh on next call")
         return
+    if args and args[0] == "--model-info":
+        if len(args) < 2:
+            print("error: --model-info requires a model name", file=sys.stderr)
+            sys.exit(2)
+        from .model_compatibility import print_model_info
+
+        sys.exit(print_model_info(args[1]))
+    if args and args[0] == "models":
+        from .model_compatibility import models_main
+
+        sys.exit(models_main(args[1:]))
     if args and args[0] == "--debug-clicks":
         os.environ["BH_DEBUG_CLICKS"] = "1"
         args = args[1:]
diff --git a/tests/unit/test_model_compatibility.py b/tests/unit/test_model_compatibility.py
new file mode 100644
index 00000000..26a5737a
--- /dev/null
+++ b/tests/unit/test_model_compatibility.py
@@ -0,0 +1,116 @@
+import sys
+from io import StringIO
+from unittest.mock import patch
+
+import pytest
+
+from browser_harness import model_compatibility
+from browser_harness.model_compatibility import (
+    infer_parameter_size_b,
+    load_registry,
+    models_list,
+    models_main,
+    parse_size_b,
+    print_model_info,
+    resolve_model,
+)
+
+
+def test_parse_size_b():
+    assert parse_size_b("35b") == 35
+    assert parse_size_b("35B") == 35
+    assert parse_size_b("8.9b") == 8.9
+    assert parse_size_b("70") == 70
+    with pytest.raises(ValueError):
+        parse_size_b("")
+
+
+def test_load_registry_smoke():
+    rows = load_registry()
+    assert isinstance(rows, list)
+    assert all("model" in r and "status" in r for r in rows)
+
+
+def test_infer_parameter_size_b():
+    assert infer_parameter_size_b({"model": "x", "parameter_size_b": 12}) == 12.0
+    assert infer_parameter_size_b({"model": "foo-70b-bar"}) == 70.0
+    assert infer_parameter_size_b({"model": "nope"}) is None
+
+
+def test_resolve_model_exact_and_substring():
+    rows = [
+        {"model": "ab", "provider": "p", "status": "works", "notes": "", "last_tested": "2026-01-01"},
+        {"model": "abc", "provider": "p", "status": "works", "notes": "", "last_tested": "2026-01-01"},
+    ]
+    assert resolve_model("abc", rows)["model"] == "abc"
+    assert resolve_model("AB", rows)["model"] == "ab"
+    amb = resolve_model("a", rows)
+    assert isinstance(amb, list) and len(amb) > 1
+
+
+def test_models_list_filters():
+    fake = [
+        {
+            "model": "big",
+            "provider": "ollama",
+            "status": "verified",
+            "parameter_size_b": 70,
+            "notes": "n",
+            "last_tested": "2026-01-01",
+        },
+        {
+            "model": "small",
+            "provider": "ollama",
+            "status": "works",
+            "parameter_size_b": 7,
+            "notes": "n",
+            "last_tested": "2026-01-01",
+        },
+    ]
+    out = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=fake), patch("sys.stdout", out):
+        assert models_list(["list", "--min-size", "35b", "--status", "verified"]) == 0
+    lines = out.getvalue().strip().splitlines()
+    assert len(lines) == 2  # header + one row
+    assert "big" in lines[1]
+    assert "small" not in lines[1]
+
+
+def test_models_list_bad_flag():
+    err = StringIO()
+    with patch("sys.stderr", err):
+        assert models_list(["list", "--nope"]) == 2
+    assert "unexpected" in err.getvalue()
+
+
+def test_models_main_dispatches_list():
+    out = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=[]), patch("sys.stdout", out):
+        assert models_main(["list"]) == 0
+    assert "model\tprovider" in out.getvalue()
+
+
+def test_print_model_info_not_found():
+    err = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=[]), patch("sys.stderr", err):
+        assert print_model_info("missing-model-xyz") == 1
+    assert "no model matched" in err.getvalue()
+
+
+def test_run_model_info_invocation():
+    from browser_harness import run
+
+    out = StringIO()
+    err = StringIO()
+    with patch.object(sys, "argv", ["browser-harness", "--model-info", "mistral-small"]), \
+            patch("sys.stdout", out), \
+            patch("sys.stderr", err):
+        try:
+            run.main()
+        except SystemExit as e:
+            assert e.code == 0
+        else:
+            raise AssertionError("expected sys.exit")
+    body = out.getvalue()
+    assert "mistral-small" in body
+    assert "anyscale" in body