From 2c09eeffb29f2eaef36044f9d3df501b553afcf7 Mon Sep 17 00:00:00 2001
From: jsdevninja
Date: Tue, 12 May 2026 08:07:10 -0500
Subject: [PATCH] feat: add model-compatibility.json registry and models CLI

---
 model-compatibility.json                     |  58 ++++++
 pyproject.toml                               |   3 +
 src/browser_harness/model-compatibility.json |  58 ++++++
 src/browser_harness/model_compatibility.py   | 199 +++++++++++++++++++
 src/browser_harness/run.py                   |  14 ++
 tests/unit/test_model_compatibility.py       | 116 +++++++++++
 6 files changed, 448 insertions(+)
 create mode 100644 model-compatibility.json
 create mode 100644 src/browser_harness/model-compatibility.json
 create mode 100644 src/browser_harness/model_compatibility.py
 create mode 100644 tests/unit/test_model_compatibility.py

diff --git a/model-compatibility.json b/model-compatibility.json
new file mode 100644
index 00000000..d1766699
--- /dev/null
+++ b/model-compatibility.json
@@ -0,0 +1,58 @@
+[
+  {
+    "model": "qwen3.6:35b-a3b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 35,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen3:8b",
+    "provider": "ollama",
+    "status": "works",
+    "parameter_size_b": 8,
+    "notes": "Usable for very short scripts; frequent tool-call mistakes on longer flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen2.5:32b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 32,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "mistral-small",
+    "provider": "anyscale",
+    "status": "verified",
+    "parameter_size_b": 22,
+    "notes": "Good balance of speed and capability for headless deployments.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "llama3.1:70b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 70,
+    "notes": "Strong reliability on multi-step automation when VRAM permits.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "phi4",
+    "provider": "ollama",
+    "status": "unknown",
+    "parameter_size_b": 14,
+    "notes": "Community reports only; not yet re-verified on current harness.",
+    "last_tested": "2026-04-15"
+  },
+  {
+    "model": "tinyllama",
+    "provider": "ollama",
+    "status": "broken",
+    "parameter_size_b": 1,
+    "notes": "Too small for reliable tool use; not recommended for harness workflows.",
+    "last_tested": "2026-05-01"
+  }
+]
diff --git a/pyproject.toml b/pyproject.toml
index f812a6ab..b11a344c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,5 +23,8 @@ package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
 where = ["src"]
 
+[tool.setuptools.package-data]
+browser_harness = ["model-compatibility.json"]
+
 [tool.pytest.ini_options]
 pythonpath = ["src"]
diff --git a/src/browser_harness/model-compatibility.json b/src/browser_harness/model-compatibility.json
new file mode 100644
index 00000000..d1766699
--- /dev/null
+++ b/src/browser_harness/model-compatibility.json
@@ -0,0 +1,58 @@
+[
+  {
+    "model": "qwen3.6:35b-a3b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 35,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen3:8b",
+    "provider": "ollama",
+    "status": "works",
+    "parameter_size_b": 8,
+    "notes": "Usable for very short scripts; frequent tool-call mistakes on longer flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "qwen2.5:32b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 32,
+    "notes": "Works well for simple click/type tasks. Struggles with complex multi-step flows.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "mistral-small",
+    "provider": "anyscale",
+    "status": "verified",
+    "parameter_size_b": 22,
+    "notes": "Good balance of speed and capability for headless deployments.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "llama3.1:70b",
+    "provider": "ollama",
+    "status": "verified",
+    "parameter_size_b": 70,
+    "notes": "Strong reliability on multi-step automation when VRAM permits.",
+    "last_tested": "2026-05-01"
+  },
+  {
+    "model": "phi4",
+    "provider": "ollama",
+    "status": "unknown",
+    "parameter_size_b": 14,
+    "notes": "Community reports only; not yet re-verified on current harness.",
+    "last_tested": "2026-04-15"
+  },
+  {
+    "model": "tinyllama",
+    "provider": "ollama",
+    "status": "broken",
+    "parameter_size_b": 1,
+    "notes": "Too small for reliable tool use; not recommended for harness workflows.",
+    "last_tested": "2026-05-01"
+  }
+]
diff --git a/src/browser_harness/model_compatibility.py b/src/browser_harness/model_compatibility.py
new file mode 100644
index 00000000..b763ea91
--- /dev/null
+++ b/src/browser_harness/model_compatibility.py
@@ -0,0 +1,199 @@
+"""Load and query the model-compatibility.json registry (issue #329)."""
+
+from __future__ import annotations
+
+import json
+import re
+import sys
+from importlib import resources
+from pathlib import Path
+from typing import Any
+
+VALID_STATUSES = frozenset({"verified", "works", "unknown", "broken"})
+
+_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)\s*([bB])\b")
+
+
+def _registry_bytes() -> bytes:
+    """Load registry JSON: repo root when developing from a src/ checkout (#329), else bundled copy."""
+    here = Path(__file__).resolve().parent
+    if here.name == "browser_harness" and here.parent.name == "src":
+        # here == <repo>/src/browser_harness, so the repo root is parents[1].
+        root = here.parents[1] / "model-compatibility.json"
+        if root.is_file():
+            return root.read_bytes()
+    bundled = here / "model-compatibility.json"
+    if bundled.is_file():
+        return bundled.read_bytes()
+    return resources.files(__package__).joinpath("model-compatibility.json").read_bytes()
+
+
+def load_registry() -> list[dict[str, Any]]:
+    raw = json.loads(_registry_bytes().decode("utf-8"))
+    if not isinstance(raw, list):
+        raise ValueError("registry must be a JSON array")
+    out: list[dict[str, Any]] = []
+    for i, row in enumerate(raw):
+        if not isinstance(row, dict):
+            raise ValueError(f"entry {i} must be an object")
+        for key in ("model", "provider", "status", "notes", "last_tested"):
+            if key not in row:
+                raise ValueError(f"entry {i} missing required field {key!r}")
+            if not isinstance(row[key], str):
+                raise ValueError(f"entry {i} field {key!r} must be a string")
+        if row["status"] not in VALID_STATUSES:
+            raise ValueError(
+                f"entry {i} invalid status {row['status']!r}; "
+                f"expected one of {sorted(VALID_STATUSES)}"
+            )
+        ps = row.get("parameter_size_b")
+        if ps is not None and not isinstance(ps, (int, float)):
+            raise ValueError(f"entry {i} parameter_size_b must be a number or omitted")
+        out.append(row)
+    return out
+
+
+def parse_size_b(token: str) -> float:
+    """Parse values like '35b', '8.9B', '70' (treated as billions)."""
+    t = token.strip().lower()
+    if not t:
+        raise ValueError("empty size")
+    m = _SIZE_RE.search(t)
+    if m:
+        return float(m.group(1))
+    if t.endswith("b") and t[:-1].replace(".", "", 1).isdigit():
+        return float(t[:-1])
+    if re.fullmatch(r"\d+(?:\.\d+)?", t):
+        return float(t)
+    raise ValueError(f"unrecognized size token: {token!r}")
+
+
+def infer_parameter_size_b(entry: dict[str, Any]) -> float | None:
+    ps = entry.get("parameter_size_b")
+    if isinstance(ps, (int, float)):
+        return float(ps)
+    name = entry.get("model", "")
+    if not isinstance(name, str):
+        return None
+    m = _SIZE_RE.search(name)
+    if m:
+        return float(m.group(1))
+    return None
+
+
+def _format_row(entry: dict[str, Any]) -> str:
+    sz = infer_parameter_size_b(entry)
+    sz_s = f"{sz:g}B" if sz is not None else "—"
+    return f"{entry['model']}\t{entry['provider']}\t{entry['status']}\t{sz_s}\t{entry['last_tested']}"
+
+
+def models_list(subargs: list[str]) -> int:
+    if not subargs or subargs[0] != "list":
+        print("usage: browser-harness models list [--min-size <n>b] [--status <status>]", file=sys.stderr)
+        return 2
+
+    i = 1
+    min_b: float | None = None
+    status_filter: str | None = None
+    while i < len(subargs):
+        a = subargs[i]
+        if a == "--min-size":
+            if i + 1 >= len(subargs):
+                print("error: --min-size requires a value (e.g. 35b)", file=sys.stderr)
+                return 2
+            try:
+                min_b = parse_size_b(subargs[i + 1])
+            except ValueError as e:
+                print(f"error: {e}", file=sys.stderr)
+                return 2
+            i += 2
+            continue
+        if a == "--status":
+            if i + 1 >= len(subargs):
+                print("error: --status requires a value", file=sys.stderr)
+                return 2
+            status_filter = subargs[i + 1].strip().lower()
+            if status_filter not in VALID_STATUSES:
+                print(
+                    f"error: unknown status {subargs[i + 1]!r}; "
+                    f"expected one of {sorted(VALID_STATUSES)}",
+                    file=sys.stderr,
+                )
+                return 2
+            i += 2
+            continue
+        print(f"error: unexpected argument {a!r}", file=sys.stderr)
+        return 2
+
+    try:
+        rows = load_registry()
+    except (OSError, json.JSONDecodeError, ValueError) as e:
+        print(f"error: failed to load model registry: {e}", file=sys.stderr)
+        return 1
+
+    filtered: list[dict[str, Any]] = []
+    for row in rows:
+        if status_filter is not None and row["status"] != status_filter:
+            continue
+        if min_b is not None:
+            sz = infer_parameter_size_b(row)
+            if sz is None or sz < min_b:
+                continue
+        filtered.append(row)
+
+    print("model\tprovider\tstatus\tsize\tlast_tested")
+    for row in filtered:
+        print(_format_row(row))
+    return 0
+
+
+def resolve_model(query: str, rows: list[dict[str, Any]]) -> dict[str, Any] | list[dict[str, Any]]:
+    q = query.strip().lower()
+    if not q:
+        return []
+    exact = [r for r in rows if r["model"].lower() == q]
+    if len(exact) == 1:
+        return exact[0]
+    if len(exact) > 1:
+        return exact
+    substr = [r for r in rows if q in r["model"].lower()]
+    if len(substr) == 1:
+        return substr[0]
+    return substr
+
+
+def print_model_info(name: str) -> int:
+    try:
+        rows = load_registry()
+    except (OSError, json.JSONDecodeError, ValueError) as e:
+        print(f"error: failed to load model registry: {e}", file=sys.stderr)
+        return 1
+
+    hit = resolve_model(name, rows)
+    if isinstance(hit, list) and not hit:
+        print(f"no model matched {name!r}", file=sys.stderr)
+        return 1
+    if isinstance(hit, list):
+        names = ", ".join(repr(r["model"]) for r in hit)
+        print(f"ambiguous query {name!r}; matches: {names}", file=sys.stderr)
+        return 1
+
+    entry = hit
+    sz = infer_parameter_size_b(entry)
+    print(f"model: {entry['model']}")
+    print(f"provider: {entry['provider']}")
+    print(f"status: {entry['status']}")
+    print(f"last_tested: {entry['last_tested']}")
+    if sz is not None:
+        print(f"parameter_size: {sz:g}B")
+    print(f"notes:\n{entry['notes']}")
+    return 0
+
+
+def models_main(argv: list[str]) -> int:
+    if not argv:
+        print("usage: browser-harness models list [options]", file=sys.stderr)
+        return 2
+    if argv[0] == "list":
+        return models_list(argv)
+    print("usage: browser-harness models list [--min-size <n>b] [--status <status>]", file=sys.stderr)
+    return 2
diff --git a/src/browser_harness/run.py b/src/browser_harness/run.py
index f41e180e..afe0acfd 100644
--- a/src/browser_harness/run.py
+++ b/src/browser_harness/run.py
@@ -41,6 +41,9 @@
 browser-harness --doctor       diagnose install, daemon, and browser state
 browser-harness --update [-y]  pull the latest version (agents: pass -y)
 browser-harness --reload       stop the daemon so next call picks up code changes
+browser-harness --model-info <model>  print registry details for a model name
+browser-harness models list [--min-size <n>b] [--status <status>]
+                               list tested self-host / third-party models (see model-compatibility.json)
 """
 
 USAGE = """Usage:
@@ -88,6 +91,17 @@ def main():
         restart_daemon()
         print("daemon stopped — will restart fresh on next call")
         return
+    if args and args[0] == "--model-info":
+        if len(args) < 2:
+            print("error: --model-info requires a model name", file=sys.stderr)
+            sys.exit(2)
+        from .model_compatibility import print_model_info
+
+        sys.exit(print_model_info(args[1]))
+    if args and args[0] == "models":
+        from .model_compatibility import models_main
+
+        sys.exit(models_main(args[1:]))
     if args and args[0] == "--debug-clicks":
         os.environ["BH_DEBUG_CLICKS"] = "1"
         args = args[1:]
diff --git a/tests/unit/test_model_compatibility.py b/tests/unit/test_model_compatibility.py
new file mode 100644
index 00000000..26a5737a
--- /dev/null
+++ b/tests/unit/test_model_compatibility.py
@@ -0,0 +1,116 @@
+import sys
+from io import StringIO
+from unittest.mock import patch
+
+import pytest
+
+from browser_harness import model_compatibility
+from browser_harness.model_compatibility import (
+    infer_parameter_size_b,
+    load_registry,
+    models_list,
+    models_main,
+    parse_size_b,
+    print_model_info,
+    resolve_model,
+)
+
+
+def test_parse_size_b():
+    assert parse_size_b("35b") == 35
+    assert parse_size_b("35B") == 35
+    assert parse_size_b("8.9b") == 8.9
+    assert parse_size_b("70") == 70
+    with pytest.raises(ValueError):
+        parse_size_b("")
+
+
+def test_load_registry_smoke():
+    rows = load_registry()
+    assert isinstance(rows, list)
+    assert all("model" in r and "status" in r for r in rows)
+
+
+def test_infer_parameter_size_b():
+    assert infer_parameter_size_b({"model": "x", "parameter_size_b": 12}) == 12.0
+    assert infer_parameter_size_b({"model": "foo-70b-bar"}) == 70.0
+    assert infer_parameter_size_b({"model": "nope"}) is None
+
+
+def test_resolve_model_exact_and_substring():
+    rows = [
+        {"model": "ab", "provider": "p", "status": "works", "notes": "", "last_tested": "2026-01-01"},
+        {"model": "abc", "provider": "p", "status": "works", "notes": "", "last_tested": "2026-01-01"},
+    ]
+    assert resolve_model("abc", rows)["model"] == "abc"
+    assert resolve_model("AB", rows)["model"] == "ab"
+    amb = resolve_model("a", rows)
+    assert isinstance(amb, list) and len(amb) > 1
+
+
+def test_models_list_filters():
+    fake = [
+        {
+            "model": "big",
+            "provider": "ollama",
+            "status": "verified",
+            "parameter_size_b": 70,
+            "notes": "n",
+            "last_tested": "2026-01-01",
+        },
+        {
+            "model": "small",
+            "provider": "ollama",
+            "status": "works",
+            "parameter_size_b": 7,
+            "notes": "n",
+            "last_tested": "2026-01-01",
+        },
+    ]
+    out = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=fake), patch("sys.stdout", out):
+        assert models_list(["list", "--min-size", "35b", "--status", "verified"]) == 0
+    lines = out.getvalue().strip().splitlines()
+    assert len(lines) == 2  # header + one row
+    assert "big" in lines[1]
+    assert "small" not in lines[1]
+
+
+def test_models_list_bad_flag():
+    err = StringIO()
+    with patch("sys.stderr", err):
+        assert models_list(["list", "--nope"]) == 2
+    assert "unexpected" in err.getvalue()
+
+
+def test_models_main_dispatches_list():
+    out = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=[]), patch("sys.stdout", out):
+        assert models_main(["list"]) == 0
+    assert "model\tprovider" in out.getvalue()
+
+
+def test_print_model_info_not_found():
+    err = StringIO()
+    with patch.object(model_compatibility, "load_registry", return_value=[]), patch("sys.stderr", err):
+        assert print_model_info("missing-model-xyz") == 1
+    assert "no model matched" in err.getvalue()
+
+
+def test_run_model_info_invocation():
+    from browser_harness import run
+
+    out = StringIO()
+    err = StringIO()
+    with patch.object(sys, "argv", ["browser-harness", "--model-info", "mistral-small"]), \
+            patch("sys.stdout", out), \
+            patch("sys.stderr", err):
+        try:
+            run.main()
+        except SystemExit as e:
+            assert e.code == 0
+        else:
+            raise AssertionError("expected sys.exit")
+    body = out.getvalue()
+    assert "mistral-small" in body
+    assert "anyscale" in body