diff --git a/docs/dark-host-activation-runbook.md b/docs/dark-host-activation-runbook.md new file mode 100644 index 0000000..fb8fab5 --- /dev/null +++ b/docs/dark-host-activation-runbook.md @@ -0,0 +1,152 @@ +# Dark-Host Activation Runbook (AIN-248) + +> **Founder-gated.** This runbook depends on three founder authorizations: +> +> 1. **~$45 in provider credits** topped up across the 5 open-weight venues (DeepInfra $15 · Together $15 · Fireworks $10 · Groq $0 · Novita $5). +> 2. **Doppler keys** mirroring those credits into the api's environment. +> 3. **Disc#12 sign-off on the Model×Host ontology** (see [dark-host-ontology-proposal.md](./dark-host-ontology-proposal.md)). +> +> Until all three are in place, **do not run the activation migration**. The smoke harness in [`scripts/dark_host_smoke.py`](../scripts/dark_host_smoke.py) can be exercised independently; the catalog stays inactive. + +## Why dark hosts matter + +The catalog has **47 inactive models across 10 providers** as of 2026-05-24 (P5 §0 of SP-4). Five of those providers are open-weight venues that host the same Llama / Mistral / DeepSeek weights at different prices — Groq (cheap-fast), DeepInfra (cheap-stable), Together (broad), Fireworks (cheap-reasoning), Novita (frontier-coverage). Activating them moves Ainfera Inference from "5-host prime-broker" to "many-host prime-broker" — venue routing becomes a real prime-brokerage product. + +Today the schema is **one-model-one-host** (verified: `select distinct(slug, provider) … having count(distinct provider) > 1` returns 0). The same model on Groq + DeepInfra + Together would need three distinct catalog rows. The Model×Host ontology proposal addresses this — see below. + +## Phase 1 — smoke (founder, no DB change) + +For each of the 5 venues, validate the adapter+upstream wiring **before** the catalog row is enrolled. The harness is read-only. 
+ +```bash +# DeepInfra +export DEEPINFRA_API_KEY="$(doppler secrets get DEEPINFRA_API_KEY --plain)" +uv run python scripts/dark_host_smoke.py \ + --provider deepinfra \ + --upstream-model meta-llama/Llama-3.3-70B-Instruct \ + > smoke/deepinfra-llama-3.3-70b.json + +# Together +export TOGETHER_API_KEY="$(doppler secrets get TOGETHER_API_KEY --plain)" +uv run python scripts/dark_host_smoke.py \ + --provider together \ + --upstream-model meta-llama/Llama-3.3-70B-Instruct-Turbo \ + > smoke/together-llama-3.3-70b.json + +# Fireworks +export FIREWORKS_API_KEY="$(doppler secrets get FIREWORKS_API_KEY --plain)" +uv run python scripts/dark_host_smoke.py \ + --provider fireworks \ + --upstream-model accounts/fireworks/models/llama-v3p3-70b-instruct \ + > smoke/fireworks-llama-3.3-70b.json + +# Groq (free tier — no credit topup needed) +export GROQ_API_KEY="$(doppler secrets get GROQ_API_KEY --plain)" +uv run python scripts/dark_host_smoke.py \ + --provider groq \ + --upstream-model llama-3.3-70b-versatile \ + > smoke/groq-llama-3.3-70b.json + +# Novita +export NOVITA_API_KEY="$(doppler secrets get NOVITA_API_KEY --plain)" +uv run python scripts/dark_host_smoke.py \ + --provider novita \ + --upstream-model meta-llama/llama-3.3-70b-instruct \ + > smoke/novita-llama-3.3-70b.json +``` + +Each report should show `"both_ok": true` and a `latency_ms` in the 200–5000ms range for both `call_1` and `call_2`. Stash the 5 JSON reports — they become evidence in the §16 audit ledger when the model rows enroll. + +## Phase 2 — Model×Host ontology decision (Disc#12) + +Before any DB change, the founder authorizes the schema shape from [dark-host-ontology-proposal.md](./dark-host-ontology-proposal.md). The proposal lays out three paths (A, B, and C, where C is Path A plus a nullable `logical_slug` column); the two main options are: + +- **Path A (minimal):** keep the existing `models` table; add one row per (logical model, host) pair, each with a venue-suffixed slug (e.g. `llama-3.3-70b-groq`, `llama-3.3-70b-deepinfra`, `llama-3.3-70b-together`). Slugs stay unique → no schema change. 
+- **Path B (M:N junction):** new `model_hosts` table; `models` rows stay slug-unique; per-host price/latency lives on the junction. Heavier migration, cleaner routing semantics. + +The runbook below assumes **Path A** (the lighter migration). Path B requires a different migration template — see the proposal doc. + +## Phase 3 — activation migration TEMPLATE (do NOT apply without §1 sign-off) + +This template is **parametrized**. Filling in the values is the founder's tap; the file lives in `alembic/versions/` only after authorization. Saved here as a doc snippet to keep the schema clean. Note that it `UPDATE`s an existing inactive catalog row: if no row matches `_MODEL_SLUG` + `_VENUE`, the migration succeeds as a no-op, so the Phase 4 check is what confirms the activation actually landed. + +```python +# alembic/versions/<revision>_activate_dark_host_<venue>_<model>.py +# DO NOT COMMIT until §1 founder authorizations are signed. + +from __future__ import annotations +from decimal import Decimal +from alembic import op + +# revision = "<revision_id>" +# down_revision = "<previous_revision_id>" + +# --- founder fills in these per-venue values from the smoke report --- +_VENUE = "deepinfra" # provider.slug +_MODEL_SLUG = "llama-3.3-70b-deepinfra" # NEW canonical slug (Path A) +_UPSTREAM_NAME = "meta-llama/Llama-3.3-70B-Instruct" +_INPUT_COST_PER_M = Decimal("0.49") # from venue pricing page +_OUTPUT_COST_PER_M = Decimal("0.79") +_Q_PRIOR = Decimal("0.78") # from AA Index v4 ÷ 100 +_BRAND_SLUG = "meta-llama" # required for compliance gate +# --------------------------------------------------------------------- + +def upgrade() -> None: + op.execute(f""" + UPDATE models SET + provider_model_name = '{_UPSTREAM_NAME}', + input_cost_per_million_usd = {_INPUT_COST_PER_M}, + output_cost_per_million_usd = {_OUTPUT_COST_PER_M}, + q_prior = {_Q_PRIOR}, + aa_index_source = 'aa_v4_2026q2', + active = TRUE + WHERE slug = '{_MODEL_SLUG}' + AND provider_id = (SELECT id FROM providers WHERE slug = '{_VENUE}'); + """) + # `brand.active = TRUE` is asserted in a separate sub-migration so + # the M_allowed gate clears for `_BRAND_SLUG`. 
+ +def downgrade() -> None: + op.execute(f""" + UPDATE models SET active = FALSE + WHERE slug = '{_MODEL_SLUG}' + AND provider_id = (SELECT id FROM providers WHERE slug = '{_VENUE}'); + """) +``` + +## Phase 4 — verify (founder, post-deploy) + +After the migration applies: + +1. **Catalog row is active:** + + ```sql + SELECT slug, active, q_prior, aa_index_source, + input_cost_per_million_usd, output_cost_per_million_usd + FROM models WHERE slug = '<model_slug>'; + ``` + + Expect `active=true`, `q_prior` populated, `aa_index_source='aa_v4_2026q2'`. + +2. **Brain enrolls it:** a routed call with the brain's default policy should now consider the new row. Read-only: + + ```sql + SELECT chosen_model_slug, candidates + FROM routing_outcomes + ORDER BY created_at DESC LIMIT 1; + ``` + + The new slug should appear in `candidates[]` for a routed call. If the new model is cheaper than the previous winner and clears the floor, it becomes `chosen_model_slug`. + +3. **Audit chain hash-link intact:** routine post-deploy. + +## Rollback + +`alembic downgrade -1` flips `active=false`. The brain stops enrolling the row on the next decision; existing inferences are unaffected (append-only audit chain). No data loss — the row stays in the catalog with the smoke-validated price/q_prior data for the next activation attempt. + +## What this runbook does NOT do + +- Change the routing engine (it's immutable per SP-4 §1). +- Re-score existing models — the new row enters the candidate set; old rows are untouched. +- Mutate any `routing_outcomes` row. +- Auto-enroll Anthropic / OpenAI / Google models — those are activated via the same template with `provider.slug ∈ {anthropic, openai, gemini, mistral, xai}`. The dark-host venues just use this runbook more often because they're the high-leverage open-weight catalog. 
diff --git a/docs/dark-host-ontology-proposal.md b/docs/dark-host-ontology-proposal.md new file mode 100644 index 0000000..356cd6e --- /dev/null +++ b/docs/dark-host-ontology-proposal.md @@ -0,0 +1,110 @@ +# Model×Host Ontology Proposal (AIN-248, Disc#12) + +> **Status: PROPOSAL.** This document does not change any schema. Aulë drafts; the founder authorizes the ontology before any migration ships. The activation runbook ([dark-host-activation-runbook.md](./dark-host-activation-runbook.md)) is parametrized on which path is chosen below. + +## The problem + +Today's `models` table treats slug as effectively one-model-one-host. The schema is: + +``` +models + id uuid (PK) + slug text -- canonical, unique-ish + provider_id uuid (FK → providers) + active bool + input_cost_per_million_usd numeric + output_cost_per_million_usd numeric + q_prior numeric(3,2) + aa_index_source text + brand_id uuid (FK → brands) + ... +``` + +Verified live (2026-05-24, `select distinct (slug, provider) … having count(distinct provider) > 1` against `dftfpwzqxoebwzepygzl`): **0 model slugs appear across multiple providers**. The catalog is operationally one-model-one-host today. + +The dark-host activation needs the SAME logical model — `llama-3.3-70b`, say — to live on **Groq + DeepInfra + Together** at three different prices, three different latencies, three different reliability profiles. That's the prime-brokerage deepening: routing chooses not just *which model* but *which venue is hosting the model right now*. + +There are three principled ways to model this. The founder picks one. + +## Path A — keep `models` flat; multi-row per logical model (lightest migration) + +Each (logical model, host) pair gets its own `models` row. 
The slug encodes the host: + +``` +slug provider_id (FK → providers) +───────────────────────────────── ────────────────────────── +llama-3.3-70b-groq groq +llama-3.3-70b-deepinfra deepinfra +llama-3.3-70b-together together +llama-3.3-70b-fireworks fireworks +llama-3.3-70b-novita novita +``` + +**Pros:** +- Zero schema change. The existing `models` table already accepts this shape — the activation migration just inserts rows with venue-suffixed slugs. +- Routing engine (`ainfera_routing.decide()`) treats each row as a candidate without any code change. The cheapest-clearing-floor objective naturally picks the cheapest venue for a given logical-model fit. +- §16 capture surfaces the actual venue routed to (`chosen_model_slug = "llama-3.3-70b-groq"`) — full attribution. + +**Cons:** +- Slugs encode host info — `llama-3.3-70b-groq` reads as "a Groq-flavored Llama", which is slightly leaky to SDK consumers. +- The marketing claim "Ainfera picks the best venue for `llama-3.3-70b`" is harder to express: the caller has to ask for `ainfera-inference` (routed) — they can't ask for `llama-3.3-70b` and have Ainfera route across venues. They'd have to ask for ONE of the suffixed slugs. +- The "logical model" concept exists only implicitly (by inspecting the slug prefix). Cross-venue reports (`SELECT count(*) WHERE logical_model = "llama-3.3-70b"`) require a substring match. + +**Migration burden:** 1 alembic migration per (model, venue) pair, applied via the runbook template. Linear effort, no schema delta. + +## Path B — `model_hosts` junction (M:N) + +Keep `models` slug-unique (`llama-3.3-70b` is one row). Add a new `model_hosts` junction that carries the per-host price + q_prior + latency profile: + +``` +models + id, slug (UNIQUE), brand_id, capabilities, ... 
+ -- NO provider_id, NO cost fields, NO q_prior on this row + +model_hosts + id uuid (PK) + model_id uuid (FK → models) + provider_id uuid (FK → providers) + upstream_model_name text -- "meta-llama/Llama-3.3-70B-Instruct" + input_cost_per_million_usd numeric + output_cost_per_million_usd numeric + q_prior numeric(3,2) -- per-host (Groq's Llama vs Together's Llama) + aa_index_source text + active bool + UNIQUE (model_id, provider_id) +``` + +**Pros:** +- Slug stays clean. `model="llama-3.3-70b"` becomes a valid routed request — the brain picks the cheapest active host. +- Cross-host aggregates are trivial: `SELECT count(*) FROM model_hosts WHERE model_id = ?`. +- Per-host quality signal: Groq's faster-cheaper Llama and Together's slower-cheaper Llama can have different `q_prior` values if `q_empirical` later diverges. +- Cleaner public narrative: "Ainfera routes ONE model across many venues" is true at the schema level. + +**Cons:** +- **Real schema migration**: add `model_hosts` + backfill from existing `models` rows + drop the per-host fields from `models` (cost / q_prior / aa_index_source / provider_id). The 5 currently-active models become 5 (model, host) junction rows. Audit chain replay needs to handle the old row shape during the migration window. +- Routing engine needs candidate-set shape change: a `Candidate` today is built from one `models` row; under Path B it's built from a `models ⋈ model_hosts` join. Brain code (`ainfera_routing.build_candidates`) gets a column rename. Engine logic is unchanged — but it's a code touch, which is a Disc#12 concern. +- `routing_outcomes.chosen_model_slug` is no longer enough to attribute a routed call to a venue; need `chosen_host_id` or `chosen_model_host` text. Schema change to the immutable §16 row — **violates SP-4 §1 routing_outcomes immutability** unless we add the column as `NULL`-default in a forward-compat migration. + +**Migration burden:** 1 big alembic migration + a coordinated routing-engine field rename. 
Heavier; needs a backward-compat read window during deploy. + +## Path C — same as Path A, but pin a "primary" host per logical model + +Path A's slugs (`llama-3.3-70b-groq`) but with a new optional column `models.logical_slug` (default null, populated for multi-host rows). Reporting joins on `logical_slug` for cross-venue aggregates; caller-facing API stays unchanged (still slug-based). + +**Pros:** lightest schema delta (one nullable column); cross-venue aggregates queryable; public narrative unchanged. +**Cons:** still requires the caller to know which suffixed slug to pin; same leak as Path A from the SDK consumer's perspective. + +## Recommendation + +**Path A** for the SP-4 dark-host activation pass — it's the lightest migration and the routing engine needs zero code change. The brain just sees more candidates. Cross-venue aggregates are postponed (substring match) until the catalog reaches enough multi-host density to justify the model_hosts junction. + +When the catalog hits ~20 multi-host models (Path A becomes painful to report on), migrate to Path B in a follow-up sprint with the §16-column-addition migration handled separately from the schema reshape. + +## Disc#12 questions for the founder + +1. **Which path?** A (light) / B (junction) / C (logical_slug column). +2. **Slug convention for Path A:** `<model>-<venue>` (e.g. `llama-3.3-70b-groq`) vs `<venue>/<model>` (e.g. `groq/llama-3.3-70b`). The first is greppable; the second matches Anthropic's `anthropic/claude-...` convention. Both work; pick one. +3. **q_prior sourcing per host:** the AA Index gives one number per logical model, so all hosts of the same logical model start at the same `q_prior` value; per-host quality only diverges once traffic lands and `q_empirical` accumulates. Confirm that's the intended starting point. +4. **Public claim:** "Ainfera Inference picks the cheapest venue for your floor" — is this in the public narrative for the activation announce, or does it stay internal until enough venues are active to demo it? 
+
+Until the founder answers these, the activation runbook stays parked. The smoke harness can be exercised in parallel — it never touches the DB.
diff --git a/scripts/dark_host_smoke.py b/scripts/dark_host_smoke.py
new file mode 100644
index 0000000..2cd7448
--- /dev/null
+++ b/scripts/dark_host_smoke.py
@@ -0,0 +1,250 @@
+"""SP-4 PR-C · dark-host smoke harness.
+
+Given a Doppler key + a (provider_slug, upstream_model_name, base_url)
+target, exercise the existing ProviderAdapter against it and report
+latency / token usage / response shape. Used to validate a new
+open-weight venue BEFORE activating any catalog rows.
+
+**Activates nothing.** This script:
+
+- Reads the API key from the environment (Doppler-injected, same
+  pattern as production).
+- Instantiates the matching `OpenAICompatAdapter` (or `AnthropicAdapter`
+  for `provider_slug == "anthropic"`).
+- Calls `.chat(...)` with a tiny "reply with the word OK" prompt
+  twice (for latency variance).
+- Prints a JSON report: `{provider, upstream_model, base_url, call_1,
+  call_2, both_ok}`. Each `call_N` is `{ok, latency_ms, input_tokens,
+  output_tokens, model_used, finish_reason, response_text}` on success
+  or `{ok, latency_ms, error}` on failure.
+- Exits 0 when both calls succeed, 1 when either fails, and 2 on a
+  configuration error (unknown provider or missing API key).
+
+It does NOT touch the DB. It does NOT enroll the model. It does NOT
+seed q_prior. Activation is the runbook in `docs/dark-host-activation-runbook.md`,
+which the founder runs AFTER this smoke succeeds AND the Model x Host
+ontology proposal in `docs/dark-host-ontology-proposal.md` has been
+authorized (Disc#12).
+
+## Usage
+
+    # The 5 open-weight venues + the keys they read from env:
+    export DEEPINFRA_API_KEY=...   # ~$15 credits
+    export TOGETHER_API_KEY=...    # ~$15 credits
+    export FIREWORKS_API_KEY=...   # ~$10 credits
+    export GROQ_API_KEY=...        # free tier
+    export NOVITA_API_KEY=...      # ~$5 credits
+
+    uv run python scripts/dark_host_smoke.py \\
+        --provider deepinfra \\
+        --upstream-model meta-llama/Llama-3.3-70B-Instruct \\
+        --base-url https://api.deepinfra.com
+
+## Why this is run by the founder, not by Aulë
+
+The smoke harness needs **live provider credits + the corresponding
+Doppler keys**, which Aulë does not have. The founder runs this
+script after topping up the venues; the output goes into the
+activation runbook as evidence the upstream is reachable + the
+adapter's `chat()` shape matches.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+import time
+from typing import Any
+
+from ainfera_api.adapters.anthropic import AnthropicAdapter
+from ainfera_api.adapters.openai_compat import OpenAICompatAdapter
+from ainfera_api.adapters.provider import ProviderAdapter, ProviderError
+
+# Provider → (env var, default base_url, adapter class). The smoke
+# harness reads from env so we never put credits in argv (where they'd
+# end up in shell history).
+_PROVIDERS: dict[str, tuple[str, str, type[ProviderAdapter]]] = {
+    "deepinfra": (
+        "DEEPINFRA_API_KEY",
+        "https://api.deepinfra.com",
+        OpenAICompatAdapter,
+    ),
+    "together": (
+        "TOGETHER_API_KEY",
+        "https://api.together.xyz",
+        OpenAICompatAdapter,
+    ),
+    "fireworks": (
+        "FIREWORKS_API_KEY",
+        "https://api.fireworks.ai/inference",
+        OpenAICompatAdapter,
+    ),
+    "groq": (
+        "GROQ_API_KEY",
+        "https://api.groq.com/openai",
+        OpenAICompatAdapter,
+    ),
+    "novita": (
+        "NOVITA_API_KEY",
+        "https://api.novita.ai/v3/openai",
+        OpenAICompatAdapter,
+    ),
+    "anthropic": (
+        "ANTHROPIC_API_KEY",
+        "https://api.anthropic.com",
+        AnthropicAdapter,
+    ),
+}
+
+
+def _elapsed_ms(start: float) -> int:
+    """Return whole milliseconds elapsed since the `time.monotonic()` reading *start*."""
+    return int((time.monotonic() - start) * 1000)
+
+
+async def _smoke_one(
+    adapter: ProviderAdapter,
+    *,
+    upstream_model: str,
+) -> dict[str, Any]:
+    """Run a single `.chat()` call against the live adapter.
+
+    Reports timing + response shape; never raises (errors return as
+    JSON-serializable dicts so the founder can pipe the output to a file).
+
+    Args:
+        adapter: the instantiated provider adapter to exercise.
+        upstream_model: the upstream-vendor model name passed to `.chat()`.
+
+    Returns:
+        On success `{ok: True, latency_ms, input_tokens, output_tokens,
+        model_used, finish_reason, response_text}` with the text cut to
+        200 characters; on failure `{ok: False, latency_ms, error}`.
+    """
+    messages = [
+        {"role": "user", "content": "Reply with exactly the word OK and nothing else."},
+    ]
+    start = time.monotonic()
+    try:
+        response = await adapter.chat(
+            model=upstream_model,
+            messages=messages,
+            max_tokens=80,
+            temperature=0.0,
+        )
+        # Stop the clock before building the report so latency_ms covers
+        # only the upstream round-trip.
+        latency_ms = _elapsed_ms(start)
+        # Built inside the `try` on purpose: a response with an unexpected
+        # shape (missing attribute, non-sliceable content) is exactly what
+        # this smoke should surface, so it must land in the JSON report
+        # rather than escape as a traceback.
+        return {
+            "ok": True,
+            "latency_ms": latency_ms,
+            "input_tokens": response.input_tokens,
+            "output_tokens": response.output_tokens,
+            "model_used": response.model_used,
+            "finish_reason": response.finish_reason,
+            "response_text": response.content[:200],
+        }
+    except ProviderError as exc:
+        # default=str keeps this handler from raising when the body holds
+        # something json cannot encode (e.g. bytes).
+        body = exc.body if isinstance(exc.body, str) else json.dumps(exc.body, default=str)
+        return {
+            "ok": False,
+            "latency_ms": _elapsed_ms(start),
+            "error": {
+                "kind": "provider_error",
+                "status_code": exc.status_code,
+                "body": body,
+            },
+        }
+    except Exception as exc:  # we want every failure mode captured in the JSON report
+        return {
+            "ok": False,
+            "latency_ms": _elapsed_ms(start),
+            "error": {"kind": type(exc).__name__, "message": str(exc)},
+        }
+
+
+async def main_async(args: argparse.Namespace) -> int:
+    """Resolve the provider config, run two smoke calls, print the JSON report.
+
+    Args:
+        args: parsed CLI namespace carrying `provider`, `upstream_model`
+            and `base_url`.
+
+    Returns:
+        Process exit code: 0 when both calls succeeded, 1 when either
+        failed, 2 when the provider is unknown or its API key is unset.
+    """
+    # argparse already restricts --provider via `choices`; this guard
+    # covers callers that build the namespace themselves.
+    if args.provider not in _PROVIDERS:
+        print(
+            f"error: unknown provider {args.provider!r}. Known: {sorted(_PROVIDERS)}",
+            file=sys.stderr,
+        )
+        return 2
+    env_var, default_base, adapter_cls = _PROVIDERS[args.provider]
+    api_key = os.environ.get(env_var)
+    if not api_key:
+        print(
+            f"error: {env_var} not set in env. Doppler-inject the key "
+            f"before running this script — see docs/dark-host-activation-runbook.md.",
+            file=sys.stderr,
+        )
+        return 2
+
+    base_url = args.base_url or default_base
+    adapter = adapter_cls(api_key=api_key, base_url=base_url)
+
+    # Two consecutive calls — gives a coarse latency variance read so
+    # cold-start vs warm doesn't read as a single misleading number.
+    call_1 = await _smoke_one(adapter, upstream_model=args.upstream_model)
+    call_2 = await _smoke_one(adapter, upstream_model=args.upstream_model)
+
+    report = {
+        "provider": args.provider,
+        "upstream_model": args.upstream_model,
+        "base_url": base_url,
+        "call_1": call_1,
+        "call_2": call_2,
+        "both_ok": call_1.get("ok") and call_2.get("ok"),
+    }
+    print(json.dumps(report, indent=2))
+    return 0 if report["both_ok"] else 1
+
+
+def main() -> int:
+    """Parse CLI arguments and run the smoke; returns the process exit code."""
+    parser = argparse.ArgumentParser(
+        description="Smoke-test a dark-host adapter target (founder-run; needs live keys)."
+    )
+    parser.add_argument(
+        "--provider",
+        required=True,
+        choices=sorted(_PROVIDERS),
+        help="open-weight venue slug",
+    )
+    parser.add_argument(
+        "--upstream-model",
+        required=True,
+        help="the upstream-vendor model name (e.g. meta-llama/Llama-3.3-70B-Instruct)",
+    )
+    parser.add_argument(
+        "--base-url",
+        default=None,
+        help="override base_url (defaults to the venue's documented endpoint)",
+    )
+    args = parser.parse_args()
+    return asyncio.run(main_async(args))
+
+
+if __name__ == "__main__":
+    sys.exit(main())