From 1d1736eb6ca1e8303c52b6acb18fc50f603ef77f Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 18:42:33 +0200 Subject: [PATCH 1/6] Add self-hosted Qwen fallback for Codex routing --- scripts/python/codex-routing-policy.yaml | 2 ++ scripts/python/codex_cost_router.py | 30 ++++++++++++++++++- scripts/python/codex_key_session_web.py | 13 +++++++- scripts/python/litellm-cost-routing.yaml | 26 ++++++++++++++++ .../python/tests/test_codex_cost_router.py | 17 +++++++++++ 5 files changed, 86 insertions(+), 2 deletions(-) diff --git a/scripts/python/codex-routing-policy.yaml b/scripts/python/codex-routing-policy.yaml index a1200f9..81052fa 100644 --- a/scripts/python/codex-routing-policy.yaml +++ b/scripts/python/codex-routing-policy.yaml @@ -1,5 +1,7 @@ # Codex cost-routing policy. # CLI options still have priority, then environment variables, then this file. +# Provider choices: auto, openai, gemini, huggingface, qwen. +# qwen uses a self-hosted OpenAI-compatible endpoint via QWEN_API_BASE. default_provider: auto default_codex_provider: litellm diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py index a7b6af1..d72e9e1 100644 --- a/scripts/python/codex_cost_router.py +++ b/scripts/python/codex_cost_router.py @@ -33,9 +33,10 @@ HF_FAST_MODEL = "codex-hf-fast" HF_CHEAP_MODEL = "codex-hf-cheap" HF_DIRECT_MODEL = "openai/gpt-oss-120b:fastest" +QWEN_LOCAL_MODEL = "codex-qwen-local" DEFAULT_MAX_INPUT_TOKENS = 12_000 DEFAULT_MAX_OUTPUT_TOKENS = 2_000 -PROVIDERS = ("auto", "openai", "gemini", "huggingface") +PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen") CODEX_PROVIDERS = ("litellm", "huggingface") MODELS = ( LIGHT_MODEL, @@ -46,6 +47,7 @@ LEGACY_STRONG_MODEL, HF_FAST_MODEL, HF_CHEAP_MODEL, + QWEN_LOCAL_MODEL, ) LITELLM_HOST = "localhost" LITELLM_PORT = 4000 @@ -75,6 +77,7 @@ LEGACY_STRONG_MODEL: {"input": 2.00, "output": 8.00}, HF_CHEAP_MODEL: {"input": 0.10, "output": 0.30}, HF_FAST_MODEL: {"input": 0.25, "output": 0.75}, + QWEN_LOCAL_MODEL: {"input": 0.0, "output": 0.0}, } SIMPLE_TERMS = ( @@ -122,6 +125,16 @@ "provider benchmark", "benchmark providers", ) +QWEN_TERMS = ( + "qwen", + "auto-heberge", + "auto heberge", + "auto-hebergee", + "self-hosted", + "self hosted", + "local llm", + "openai-compatible local", +) LONG_CONTEXT_TERMS = ( "gros contexte", "long contexte", @@ -378,6 +391,11 @@ def hf_available() -> bool: return bool(os.environ.get("HF_TOKEN")) +def qwen_available() -> bool: + """Return whether a self-hosted Qwen endpoint is configured.""" + return bool(os.environ.get("QWEN_API_BASE")) + + def default_provider() -> str: """Read the provider preference from the environment with a safe fallback.""" provider = os.environ.get("CODEX_ROUTER_PROVIDER", "auto").casefold() @@ -487,6 +505,11 @@ def route_model( return model, f"huggingface provider requested; {reason}" return DEFAULT_MODEL, "huggingface requested but HF_TOKEN is missing; using default OpenAI/Gemini tier" + if provider == "qwen": + if qwen_available(): + return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}" + return DEFAULT_MODEL, "qwen requested but QWEN_API_BASE is missing; using default OpenAI/Gemini tier" + if provider == "openai": model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL return model, f"openai provider requested; {reason}" @@ -499,6 +522,9 @@ def route_model( model = HF_CHEAP_MODEL if complexity == "simple" else HF_FAST_MODEL return model, f"huggingface-related task; {reason}" + if any(term in normalized for term in QWEN_TERMS) and qwen_available(): + return QWEN_LOCAL_MODEL, f"qwen local task; {reason}" + if wants_long_context: return LONG_MODEL, f"long-context task; {reason}" @@ -656,6 +682,8 @@ def print_doctor() -> int: ("LiteLLM proxy localhost:4000", proxy_available(), "listening" if proxy_available() else "not listening"), ("LITELLM_API_KEY", bool(os.environ.get("LITELLM_API_KEY")), "set" if os.environ.get("LITELLM_API_KEY") else "missing"), ("OPENAI_API_KEY", bool(os.environ.get("OPENAI_API_KEY")), "set" if os.environ.get("OPENAI_API_KEY") else "missing"), + ("QWEN_API_BASE optional", True, os.environ.get("QWEN_API_BASE") or "missing; self-hosted Qwen fallback disabled"), + ("QWEN_API_KEY optional", True, "set" if os.environ.get("QWEN_API_KEY") else "missing; use dummy value for no-auth local servers"), ("HF_TOKEN optional", True, "set" if hf_available() else "missing; Hugging Face aliases disabled"), ("PYTHONUTF8", os.environ.get("PYTHONUTF8") == "1", "1" if os.environ.get("PYTHONUTF8") == "1" else "missing or not 1"), ("Cost-routing profile", router_enabled(), "enabled" if router_enabled() else "disabled"), diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py index f957582..46aacbe 100644 --- a/scripts/python/codex_key_session_web.py +++ b/scripts/python/codex_key_session_web.py @@ -102,6 +102,10 @@ + + + +
@@ -199,7 +203,9 @@ def _start_proxy(self) -> None: openai_key = form.get("OPENAI_API_KEY", "") gemini_key = form.get("GEMINI_API_KEY", "") hf_token = form.get("HF_TOKEN", "") - if not any((openai_key, gemini_key, hf_token)): + qwen_base = form.get("QWEN_API_BASE", "") + qwen_key = form.get("QWEN_API_KEY", "") + if not any((openai_key, gemini_key, hf_token, qwen_base)): self.state.message = "Provide at least one provider key." self._send_page() return @@ -215,6 +221,9 @@ def _start_proxy(self) -> None: env["GEMINI_API_KEY"] = gemini_key if hf_token: env["HF_TOKEN"] = hf_token + if qwen_base: + env["QWEN_API_BASE"] = qwen_base.rstrip("/") + env["QWEN_API_KEY"] = qwen_key or "sk-local-qwen" try: self.state.process = subprocess.Popen( @@ -246,6 +255,8 @@ def _start_proxy(self) -> None: providers.append("Gemini") if hf_token: providers.append("Hugging Face") + if qwen_base: + providers.append("Qwen local") self.state.message = "Session proxy started with: " + ", ".join(providers) else: self.state.message = "LiteLLM process started, but the proxy port did not become ready yet." diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml index d7027c4..5a85c7a 100644 --- a/scripts/python/litellm-cost-routing.yaml +++ b/scripts/python/litellm-cost-routing.yaml @@ -5,7 +5,11 @@ # - codex-default: normal coding work, OpenAI biased with Gemini relief # - codex-long: long-context reads and synthesis, Gemini Pro biased # - codex-deep: high-stakes debugging/security/architecture, OpenAI biased +# - codex-qwen-local: self-hosted OpenAI-compatible Qwen fallback # API keys are read from environment variables and must never be committed. +# Qwen fallback expects QWEN_API_BASE, for example http://127.0.0.1:8000/v1. +# QWEN_API_KEY is optional for many local servers; use any dummy value when the +# server does not require authentication. model_list: # Backward-compatible aliases used by older wrapper calls. @@ -88,6 +92,17 @@ model_list: api_key: os.environ/GEMINI_API_KEY weight: 1 + # Local OpenAI-compatible fallback for a self-hosted Qwen endpoint. + # Start a server with a /v1-compatible API and set: + # QWEN_API_BASE=http://127.0.0.1:8000/v1 + # QWEN_API_KEY=sk-local-qwen + - model_name: codex-qwen-local + litellm_params: + model: openai/qwen-auto-hosted + api_base: os.environ/QWEN_API_BASE + api_key: os.environ/QWEN_API_KEY + weight: 1 + - model_name: codex-hf-cheap litellm_params: model: huggingface/groq/openai/gpt-oss-120b @@ -109,29 +124,40 @@ router_settings: gpt-5.4-mini: codex-light gemini-3.5-pro: codex-long gemini-3.5-flash: codex-light + qwen-auto-hosted: codex-qwen-local fallbacks: - codex-light: - codex-default + - codex-qwen-local - codex-default: - codex-long - codex-light + - codex-qwen-local - codex-long: - codex-default + - codex-qwen-local - codex-deep: - codex-default - codex-long + - codex-qwen-local - codex-cheap: - codex-strong - codex-default + - codex-qwen-local - codex-strong: - codex-default - codex-long + - codex-qwen-local - codex-hf-cheap: - codex-light - codex-cheap + - codex-qwen-local - codex-hf-fast: - codex-default - codex-deep + - codex-qwen-local + - codex-qwen-local: + - codex-light context_window_fallbacks: - codex-light: - codex-long diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py index cf270bb..de14cca 100644 --- a/scripts/python/tests/test_codex_cost_router.py +++ b/scripts/python/tests/test_codex_cost_router.py @@ -60,6 +60,23 @@ def test_route_model_falls_back_when_hugging_face_token_is_missing(self) -> None self.assertEqual(model, "codex-default") self.assertIn("HF_TOKEN is missing", reason) + def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None: + with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:8000/v1"}): + self.assertEqual( + ROUTER.route_model("Use qwen auto heberge as backup", provider="qwen")[0], + "codex-qwen-local", + ) + self.assertEqual( + ROUTER.route_model("Prefer self-hosted local llm fallback")[0], + "codex-qwen-local", + ) + + def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None: + with patch.dict(ROUTER.os.environ, {}, clear=True): + model, reason = ROUTER.route_model("Use Qwen local", provider="qwen") + self.assertEqual(model, "codex-default") + self.assertIn("QWEN_API_BASE is missing", reason) + def test_codex_provider_helpers_select_expected_profiles(self) -> None: self.assertEqual(ROUTER.codex_profile("litellm"), "cost-routing") self.assertEqual(ROUTER.codex_profile("huggingface"), "cost-routing-hf") From cddd21f8166f13c7129e2e030d712ed9febbc5cf Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 19:04:58 +0200 Subject: [PATCH 2/6] Fix Hugging Face Codex dispatch routing --- scripts/python/codex_key_session_web.py | 1 + scripts/python/litellm-cost-routing.yaml | 2 +- .../tests/test_codex_key_session_web.py | 41 ++++++++++++++++++- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py index 46aacbe..629f564 100644 --- a/scripts/python/codex_key_session_web.py +++ b/scripts/python/codex_key_session_web.py @@ -221,6 +221,7 @@ def _start_proxy(self) -> None: env["GEMINI_API_KEY"] = gemini_key if hf_token: env["HF_TOKEN"] = hf_token + env["HUGGINGFACE_API_KEY"] = hf_token if qwen_base: env["QWEN_API_BASE"] = qwen_base.rstrip("/") env["QWEN_API_KEY"] = qwen_key or "sk-local-qwen" diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml index 5a85c7a..7bb2db6 100644 --- a/scripts/python/litellm-cost-routing.yaml +++ b/scripts/python/litellm-cost-routing.yaml @@ -110,7 +110,7 @@ model_list: - model_name: codex-hf-fast litellm_params: - model: huggingface/together/deepseek-ai/DeepSeek-R1 + model: huggingface/together/openai/gpt-oss-120b api_key: os.environ/HF_TOKEN router_settings: diff --git a/scripts/python/tests/test_codex_key_session_web.py b/scripts/python/tests/test_codex_key_session_web.py index afce691..8842531 100644 --- a/scripts/python/tests/test_codex_key_session_web.py +++ b/scripts/python/tests/test_codex_key_session_web.py @@ -1,9 +1,10 @@ """Tests for the optional local web key session launcher.""" import importlib.util +import os import unittest from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch MODULE_PATH = Path(__file__).resolve().parents[1] / "codex_key_session_web.py" @@ -40,6 +41,44 @@ def test_stop_proxy_terminates_running_process(self) -> None: self.assertIsNone(state.process) self.assertEqual(state.message, "done") + def test_start_proxy_exports_hf_token_under_litellm_mapping_name(self) -> None: + handler = object.__new__(WEB.KeySessionHandler) + handler.state = WEB.SessionState() + handler.config_path = Path("config.yaml") + handler.litellm_path = Path("litellm.exe") + handler.proxy_host = "127.0.0.1" + handler.proxy_port = 4000 + handler._read_form = MagicMock( # type: ignore[method-assign] + return_value={ + "OPENAI_API_KEY": "", + "GEMINI_API_KEY": "", + "HF_TOKEN": "hf_test", + "QWEN_API_BASE": "", + "QWEN_API_KEY": "", + } + ) + handler._stop_proxy = MagicMock() # type: ignore[method-assign] + handler._send_page = MagicMock() # type: ignore[method-assign] + + process = MagicMock() + process.poll.return_value = None + captured_env: dict[str, str] = {} + + def fake_popen(*args: object, **kwargs: object) -> MagicMock: + captured_env.update(kwargs["env"]) # type: ignore[index] + return process + + with ( + patch.object(WEB.subprocess, "Popen", side_effect=fake_popen), + patch.object(WEB, "wait_for_port", return_value=True), + patch.dict(os.environ, {}, clear=True), + ): + handler._start_proxy() + + self.assertEqual(captured_env["HF_TOKEN"], "hf_test") + self.assertEqual(captured_env["HUGGINGFACE_API_KEY"], "hf_test") + self.assertIn("Hugging Face", handler.state.message) + if __name__ == "__main__": unittest.main() From 6a634cee4a0bb0ca017a43146d8f602668f894a3 Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 20:36:13 +0200 Subject: [PATCH 3/6] Document and verify Codex LiteLLM dispatch --- .../Install-CodexLocalLiteLLMAssets.ps1 | 3 +- scripts/python/README.md | 11 ++- scripts/python/README_Codex_Cost_Routing.md | 52 +++++++++- scripts/python/Test-CodexLiteLLMDispatch.ps1 | 95 +++++++++++++++++++ 4 files changed, 151 insertions(+), 10 deletions(-) create mode 100644 scripts/python/Test-CodexLiteLLMDispatch.ps1 diff --git a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 index bc87468..6fc0938 100644 --- a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 +++ b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 @@ -8,7 +8,8 @@ New-Item -ItemType Directory -Force -Path $target | Out-Null $files = @( 'litellm-cost-routing.yaml', 'codex_key_session_web.py', - 'Start-CodexKeySessionWeb.ps1' + 'Start-CodexKeySessionWeb.ps1', + 'Test-CodexLiteLLMDispatch.ps1' ) foreach ($file in $files) { diff --git a/scripts/python/README.md b/scripts/python/README.md index 02ebe3e..b1b9d59 100644 --- a/scripts/python/README.md +++ b/scripts/python/README.md @@ -58,9 +58,10 @@ dispatches those aliases across OpenAI and Gemini while keeping API keys in environment variables. When `HF_TOKEN` is available, it can also route Hugging Face and multi-provider tasks through the `codex-hf-cheap` and `codex-hf-fast` LiteLLM aliases, or launch an optional `cost-routing-hf` Codex profile that -points directly at the Hugging Face router. `codex-routing-policy.yaml` keeps -the default provider rules and fallback order editable without changing Python -code. +points directly at the Hugging Face router. `codex-qwen-local` is available as +a self-hosted OpenAI-compatible Qwen fallback when `QWEN_API_BASE` is set. +`codex-routing-policy.yaml` keeps the default provider rules and fallback order +editable without changing Python code. See [`README_Codex_Cost_Routing.md`](README_Codex_Cost_Routing.md) for setup, activation, LiteLLM configuration, and usage instructions. @@ -68,7 +69,9 @@ activation, LiteLLM configuration, and usage instructions. To enter OpenAI, Gemini, or Hugging Face keys through a local page for one session, run `Start-CodexKeySessionWeb.ps1` and open `http://127.0.0.1:8787/`. Keys are kept in memory for the LiteLLM subprocess -and are not written to disk. +and are not written to disk. Use `Test-CodexLiteLLMDispatch.ps1` to verify the +local proxy aliases, or add `-Call -Model codex-hf-cheap` after entering a +provider key to make one minimal dispatch request. ## LLM Review Tools diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md index 2eb5ed9..6ec17a7 100644 --- a/scripts/python/README_Codex_Cost_Routing.md +++ b/scripts/python/README_Codex_Cost_Routing.md @@ -34,7 +34,7 @@ provider pool. The local config still includes two optional aliases: ```yaml codex-hf-cheap -> huggingface/groq/openai/gpt-oss-120b -codex-hf-fast -> huggingface/together/deepseek-ai/DeepSeek-R1 +codex-hf-fast -> huggingface/together/openai/gpt-oss-120b ``` Set `HF_TOKEN` in the shell before starting the router. A fine-grained token @@ -56,6 +56,30 @@ python .\scripts\python\codex_cost_router.py run --dry-run ` `--provider auto` routes Hugging Face or multi-provider prompts to the HF aliases only when `HF_TOKEN` is present. Otherwise it keeps the OpenAI-backed aliases. +LiteLLM also uses `HUGGINGFACE_API_KEY` while resolving some Inference Provider +mappings. The local web session exports the submitted `HF_TOKEN` under both +names for the LiteLLM subprocess. If you start LiteLLM manually, set both names +to the same token: + +```powershell +$env:HF_TOKEN = 'hf_...' +$env:HUGGINGFACE_API_KEY = $env:HF_TOKEN +``` + +## Self-Hosted Qwen Fallback + +The local LiteLLM config includes `codex-qwen-local` as a final fallback for +the main Codex aliases. It expects an OpenAI-compatible local endpoint: + +```powershell +$env:QWEN_API_BASE = 'http://127.0.0.1:8000/v1' +$env:QWEN_API_KEY = 'sk-local-qwen' +``` + +`QWEN_API_KEY` can be any dummy value when your local server does not require +authentication. The local web key page also accepts these two fields and passes +them only to the LiteLLM subprocess environment. + Second, Hugging Face can be added as an optional Codex-facing layer. Running `enable` now installs two managed profiles: @@ -155,10 +179,10 @@ If you prefer entering keys in a local page for one work session, start: ``` Then open `http://127.0.0.1:8787/`, paste `OPENAI_API_KEY`, -`GEMINI_API_KEY`, or `HF_TOKEN`, and submit the form. The page starts the -LiteLLM proxy on `http://127.0.0.1:4000/v1` with those keys only in the proxy -process environment. The keys are not written to disk and the web server -suppresses request logging. +`GEMINI_API_KEY`, `HF_TOKEN`, or the optional Qwen endpoint fields, and submit +the form. The page starts the LiteLLM proxy on `http://127.0.0.1:4000/v1` with +those values only in the proxy process environment. The keys are not written to +disk and the web server suppresses request logging. To launch the optional Hugging Face-facing profile instead of the local LiteLLM proxy: @@ -183,6 +207,23 @@ python .\scripts\python\codex_cost_router.py doctor If a browser opened on `http://localhost:4000/health` shows `Unauthorized`, that is expected: the local proxy is protected by `LITELLM_API_KEY`. +Validate the local proxy aliases without making a paid/model call: + +```powershell +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 +``` + +Run a real minimal provider call after entering the relevant key in the local +web page: + +```powershell +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-hf-cheap -Call +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-qwen-local -Call +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-default -Call +``` + +The test prints a compact JSON result and never prints provider tokens. + ## Optimized One-Shot Requests Use the Python wrapper when prompt cleanup and dynamic model routing are needed: @@ -225,6 +266,7 @@ Prompts and API keys are not logged. - `codex_cost_router.py`: prompt optimization and one-shot routing. - `codex_key_session_web.py`: local-only web form for session keys. - `Start-CodexKeySessionWeb.ps1`: PowerShell launcher for the local key page. +- `Test-CodexLiteLLMDispatch.ps1`: local proxy alias and optional call test. - `codex-routing-policy.yaml`: editable routing policy and fallback order. - `litellm-cost-routing.yaml`: local LiteLLM OSS OpenAI/Gemini model groups, context-window fallbacks, cooldowns, and compatibility aliases. diff --git a/scripts/python/Test-CodexLiteLLMDispatch.ps1 b/scripts/python/Test-CodexLiteLLMDispatch.ps1 new file mode 100644 index 0000000..0c99189 --- /dev/null +++ b/scripts/python/Test-CodexLiteLLMDispatch.ps1 @@ -0,0 +1,95 @@ +[CmdletBinding()] +param( + [string]$BaseUrl = "http://127.0.0.1:4000/v1", + [string]$ApiKey = "sk-local-codex", + [string]$Model = "codex-default", + [switch]$Call, + [int]$TimeoutSec = 90 +) + +$ErrorActionPreference = "Stop" + +$headers = @{ + "Authorization" = "Bearer $ApiKey" + "Content-Type" = "application/json" +} + +function ConvertTo-ShortError { + param([object]$ErrorRecord) + + $message = $ErrorRecord.Exception.Message + if ($ErrorRecord.ErrorDetails -and $ErrorRecord.ErrorDetails.Message) { + $message = $ErrorRecord.ErrorDetails.Message + } + if ($message.Length -gt 900) { + return $message.Substring(0, 900) + "..." + } + return $message +} + +$models = Invoke-RestMethod -Uri "$BaseUrl/models" -Headers $headers -Method Get -TimeoutSec 10 +$modelIds = @($models.data | ForEach-Object { $_.id }) +$requiredAliases = @( + "codex-light", + "codex-default", + "codex-long", + "codex-deep", + "codex-qwen-local", + "codex-hf-cheap", + "codex-hf-fast" +) +$missingAliases = @($requiredAliases | Where-Object { $modelIds -notcontains $_ }) + +$health = $null +try { + $healthUrl = $BaseUrl -replace "/v1$", "" + $health = Invoke-RestMethod -Uri "$healthUrl/health" -Headers $headers -Method Get -TimeoutSec $TimeoutSec +} +catch { + $health = [pscustomobject]@{ + healthy_count = $null + unhealthy_count = $null + health_error = ConvertTo-ShortError $_ + } +} + +$callResult = $null +if ($Call) { + $body = @{ + model = $Model + messages = @( + @{ + role = "user" + content = "Reply with exactly: dispatch ok" + } + ) + max_tokens = 16 + temperature = 0 + } | ConvertTo-Json -Depth 6 + + try { + $response = Invoke-RestMethod -Uri "$BaseUrl/chat/completions" -Headers $headers -Method Post -Body $body -TimeoutSec $TimeoutSec + $callResult = [pscustomobject]@{ + ok = $true + model = $response.model + content = $response.choices[0].message.content + } + } + catch { + $callResult = [pscustomobject]@{ + ok = $false + error = ConvertTo-ShortError $_ + } + } +} + +[pscustomobject]@{ + ok = ($missingAliases.Count -eq 0 -and (-not $Call -or ($callResult -and $callResult.ok))) + base_url = $BaseUrl + aliases_present = @($requiredAliases | Where-Object { $modelIds -contains $_ }) + aliases_missing = $missingAliases + healthy_count = $health.healthy_count + unhealthy_count = $health.unhealthy_count + health_error = $health.health_error + call = $callResult +} | ConvertTo-Json -Depth 6 From d196b93504524123da53b0c5fe11a7b600497d36 Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 20:57:04 +0200 Subject: [PATCH 4/6] Use Qwen2.5 Coder GGUF through Ollama --- .../Install-CodexLocalLiteLLMAssets.ps1 | 1 + scripts/python/README.md | 2 +- scripts/python/README_Codex_Cost_Routing.md | 29 ++++++---- scripts/python/Start-CodexQwenOllama.ps1 | 55 +++++++++++++++++++ scripts/python/codex_cost_router.py | 18 ++++-- scripts/python/litellm-cost-routing.yaml | 22 ++++---- .../python/tests/test_codex_cost_router.py | 8 ++- 7 files changed, 103 insertions(+), 32 deletions(-) create mode 100644 scripts/python/Start-CodexQwenOllama.ps1 diff --git a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 index 6fc0938..42f54f8 100644 --- a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 +++ b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 @@ -9,6 +9,7 @@ $files = @( 'litellm-cost-routing.yaml', 'codex_key_session_web.py', 'Start-CodexKeySessionWeb.ps1', + 'Start-CodexQwenOllama.ps1', 'Test-CodexLiteLLMDispatch.ps1' ) diff --git a/scripts/python/README.md b/scripts/python/README.md index b1b9d59..a0ed839 100644 --- a/scripts/python/README.md +++ b/scripts/python/README.md @@ -59,7 +59,7 @@ environment variables. When `HF_TOKEN` is available, it can also route Hugging Face and multi-provider tasks through the `codex-hf-cheap` and `codex-hf-fast` LiteLLM aliases, or launch an optional `cost-routing-hf` Codex profile that points directly at the Hugging Face router. `codex-qwen-local` is available as -a self-hosted OpenAI-compatible Qwen fallback when `QWEN_API_BASE` is set. +a local Ollama fallback through `Qwen/Qwen2.5-Coder-7B-Instruct-GGUF`. `codex-routing-policy.yaml` keeps the default provider rules and fallback order editable without changing Python code. diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md index 6ec17a7..834444b 100644 --- a/scripts/python/README_Codex_Cost_Routing.md +++ b/scripts/python/README_Codex_Cost_Routing.md @@ -66,19 +66,24 @@ $env:HF_TOKEN = 'hf_...' $env:HUGGINGFACE_API_KEY = $env:HF_TOKEN ``` -## Self-Hosted Qwen Fallback +## Local Ollama Qwen Fallback The local LiteLLM config includes `codex-qwen-local` as a final fallback for -the main Codex aliases. It expects an OpenAI-compatible local endpoint: +the main Codex aliases. It uses Ollama's OpenAI-compatible endpoint with the +lighter Qwen2.5 Coder 7B GGUF model: ```powershell -$env:QWEN_API_BASE = 'http://127.0.0.1:8000/v1' -$env:QWEN_API_KEY = 'sk-local-qwen' +.\scripts\python\Start-CodexQwenOllama.ps1 ``` -`QWEN_API_KEY` can be any dummy value when your local server does not require -authentication. The local web key page also accepts these two fields and passes -them only to the LiteLLM subprocess environment. +The script starts Ollama if needed and pulls: + +```text +hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest +``` + +LiteLLM then reaches it through `http://127.0.0.1:11434/v1`. No provider API key +is required for this local fallback. Second, Hugging Face can be added as an optional Codex-facing layer. Running `enable` now installs two managed profiles: @@ -179,10 +184,12 @@ If you prefer entering keys in a local page for one work session, start: ``` Then open `http://127.0.0.1:8787/`, paste `OPENAI_API_KEY`, -`GEMINI_API_KEY`, `HF_TOKEN`, or the optional Qwen endpoint fields, and submit -the form. The page starts the LiteLLM proxy on `http://127.0.0.1:4000/v1` with -those values only in the proxy process environment. The keys are not written to -disk and the web server suppresses request logging. +`GEMINI_API_KEY`, `HF_TOKEN`, or optional custom Qwen endpoint fields, and +submit the form. For the default local Qwen/Ollama fallback, run +`Start-CodexQwenOllama.ps1`; no Qwen API key is needed. The page starts the +LiteLLM proxy on `http://127.0.0.1:4000/v1` with submitted values only in the +proxy process environment. The keys are not written to disk and the web server +suppresses request logging. To launch the optional Hugging Face-facing profile instead of the local LiteLLM proxy: diff --git a/scripts/python/Start-CodexQwenOllama.ps1 b/scripts/python/Start-CodexQwenOllama.ps1 new file mode 100644 index 0000000..2372566 --- /dev/null +++ b/scripts/python/Start-CodexQwenOllama.ps1 @@ -0,0 +1,55 @@ +[CmdletBinding()] +param( + [string]$Model = "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest", + [switch]$SkipPull +) + +$ErrorActionPreference = "Stop" + +$ollama = Get-Command ollama -ErrorAction SilentlyContinue +if (-not $ollama) { + throw "Ollama is not installed or not available in PATH." +} + +try { + Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 3 | Out-Null +} +catch { + Start-Process -WindowStyle Hidden -FilePath $ollama.Source -ArgumentList @("serve") + $ready = $false + for ($i = 0; $i -lt 40; $i++) { + Start-Sleep -Milliseconds 500 + try { + Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 2 | Out-Null + $ready = $true + break + } + catch { + $ready = $false + } + } + if (-not $ready) { + throw "Ollama did not become ready on http://127.0.0.1:11434." + } +} + +$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10 +$modelNames = @($tags.models | ForEach-Object { $_.name }) +if (($modelNames -notcontains $Model) -and (-not $SkipPull)) { + & $ollama.Source pull $Model + if ($LASTEXITCODE -ne 0) { + throw "ollama pull failed for $Model" + } +} + +$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10 +$modelNames = @($tags.models | ForEach-Object { $_.name }) +if ($modelNames -notcontains $Model) { + throw "$Model is not installed. Run without -SkipPull to download it." +} + +[pscustomobject]@{ + ok = $true + model = $Model + api_base = "http://127.0.0.1:11434/v1" +} | ConvertTo-Json diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py index d72e9e1..f4ce62a 100644 --- a/scripts/python/codex_cost_router.py +++ b/scripts/python/codex_cost_router.py @@ -51,6 +51,8 @@ ) LITELLM_HOST = "localhost" LITELLM_PORT = 4000 +OLLAMA_HOST = "127.0.0.1" +OLLAMA_PORT = 11434 WINDOWS_LITELLM_FALLBACK = Path(r"C:\tmp\litellm-oss\Scripts\litellm.exe") POLICY_FILE = Path(__file__).with_name("codex-routing-policy.yaml") DEFAULT_POLICY = { @@ -392,8 +394,15 @@ def hf_available() -> bool: def qwen_available() -> bool: - """Return whether a self-hosted Qwen endpoint is configured.""" - return bool(os.environ.get("QWEN_API_BASE")) + """Return whether the local Ollama Qwen endpoint is reachable.""" + configured_base = os.environ.get("QWEN_API_BASE") + if configured_base: + return True + try: + with socket.create_connection((OLLAMA_HOST, OLLAMA_PORT), timeout=1): + return True + except OSError: + return False def default_provider() -> str: @@ -508,7 +517,7 @@ def route_model( if provider == "qwen": if qwen_available(): return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}" - return DEFAULT_MODEL, "qwen requested but QWEN_API_BASE is missing; using default OpenAI/Gemini tier" + return DEFAULT_MODEL, "qwen requested but Ollama is not listening on 127.0.0.1:11434; using default OpenAI/Gemini tier" if provider == "openai": model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL @@ -682,8 +691,7 @@ def print_doctor() -> int: ("LiteLLM proxy localhost:4000", proxy_available(), "listening" if proxy_available() else "not listening"), ("LITELLM_API_KEY", bool(os.environ.get("LITELLM_API_KEY")), "set" if os.environ.get("LITELLM_API_KEY") else "missing"), ("OPENAI_API_KEY", bool(os.environ.get("OPENAI_API_KEY")), "set" if os.environ.get("OPENAI_API_KEY") else "missing"), - ("QWEN_API_BASE optional", True, os.environ.get("QWEN_API_BASE") or "missing; self-hosted Qwen fallback disabled"), - ("QWEN_API_KEY optional", True, "set" if os.environ.get("QWEN_API_KEY") else "missing; use dummy value for no-auth local servers"), + ("Ollama Qwen optional", True, "listening on 127.0.0.1:11434" if qwen_available() else "missing; run Start-CodexQwenOllama.ps1"), ("HF_TOKEN optional", True, "set" if hf_available() else "missing; Hugging Face aliases disabled"), ("PYTHONUTF8", os.environ.get("PYTHONUTF8") == "1", "1" if os.environ.get("PYTHONUTF8") == "1" else "missing or not 1"), ("Cost-routing profile", router_enabled(), "enabled" if router_enabled() else "disabled"), diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml index 7bb2db6..ccda823 100644 --- a/scripts/python/litellm-cost-routing.yaml +++ b/scripts/python/litellm-cost-routing.yaml @@ -5,11 +5,10 @@ # - codex-default: normal coding work, OpenAI biased with Gemini relief # - codex-long: long-context reads and synthesis, Gemini Pro biased # - codex-deep: high-stakes debugging/security/architecture, OpenAI biased -# - codex-qwen-local: self-hosted OpenAI-compatible Qwen fallback +# - codex-qwen-local: local Ollama Qwen fallback # API keys are read from environment variables and must never be committed. -# Qwen fallback expects QWEN_API_BASE, for example http://127.0.0.1:8000/v1. -# QWEN_API_KEY is optional for many local servers; use any dummy value when the -# server does not require authentication. +# Qwen fallback expects Ollama on http://127.0.0.1:11434/v1 with: +# hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest model_list: # Backward-compatible aliases used by older wrapper calls. @@ -92,15 +91,14 @@ model_list: api_key: os.environ/GEMINI_API_KEY weight: 1 - # Local OpenAI-compatible fallback for a self-hosted Qwen endpoint. - # Start a server with a /v1-compatible API and set: - # QWEN_API_BASE=http://127.0.0.1:8000/v1 - # QWEN_API_KEY=sk-local-qwen + # Local Ollama fallback for Qwen2.5 Coder 7B GGUF. + # Prepare with: + # ollama pull hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest - model_name: codex-qwen-local litellm_params: - model: openai/qwen-auto-hosted - api_base: os.environ/QWEN_API_BASE - api_key: os.environ/QWEN_API_KEY + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local weight: 1 - model_name: codex-hf-cheap @@ -124,7 +122,7 @@ router_settings: gpt-5.4-mini: codex-light gemini-3.5-pro: codex-long gemini-3.5-flash: codex-light - qwen-auto-hosted: codex-qwen-local + hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest: codex-qwen-local fallbacks: - codex-light: - codex-default diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py index de14cca..fe75d52 100644 --- a/scripts/python/tests/test_codex_cost_router.py +++ b/scripts/python/tests/test_codex_cost_router.py @@ -61,7 +61,7 @@ def test_route_model_falls_back_when_hugging_face_token_is_missing(self) -> None self.assertIn("HF_TOKEN is missing", reason) def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None: - with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:8000/v1"}): + with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}): self.assertEqual( ROUTER.route_model("Use qwen auto heberge as backup", provider="qwen")[0], "codex-qwen-local", @@ -72,10 +72,12 @@ def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None ) def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None: - with patch.dict(ROUTER.os.environ, {}, clear=True): + with patch.dict(ROUTER.os.environ, {}, clear=True), patch.object( + ROUTER.socket, "create_connection", side_effect=OSError + ): model, reason = ROUTER.route_model("Use Qwen local", provider="qwen") self.assertEqual(model, "codex-default") - self.assertIn("QWEN_API_BASE is missing", reason) + self.assertIn("Ollama is not listening", reason) def test_codex_provider_helpers_select_expected_profiles(self) -> None: self.assertEqual(ROUTER.codex_profile("litellm"), "cost-routing") From 67615854ac11d51f49af62e948331d3f9ac06b2d Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 22:40:25 +0200 Subject: [PATCH 5/6] Simplify Codex LiteLLM session page --- scripts/python/codex_key_session_web.py | 195 ++++++++++++++---- .../tests/test_codex_key_session_web.py | 33 +++ 2 files changed, 193 insertions(+), 35 deletions(-) diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py index 629f564..35b89b1 100644 --- a/scripts/python/codex_key_session_web.py +++ b/scripts/python/codex_key_session_web.py @@ -10,6 +10,7 @@ import sys import time import urllib.parse +import urllib.request from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path from typing import ClassVar @@ -30,28 +31,56 @@ :root {{ color-scheme: light dark; font-family: Segoe UI, system-ui, sans-serif; + --accent: #1f6feb; + --ok: #238636; + --warn: #9a6700; }} body {{ margin: 0; min-height: 100vh; - display: grid; - place-items: center; + background: color-mix(in srgb, Canvas 94%, CanvasText); background: Canvas; color: CanvasText; }} main {{ - width: min(680px, calc(100vw - 32px)); - border: 1px solid color-mix(in srgb, CanvasText 18%, transparent); - border-radius: 8px; - padding: 24px; + width: min(880px, calc(100vw - 32px)); + margin: 28px auto; + padding: 0 0 28px; }} h1 {{ - font-size: 22px; - margin: 0 0 8px; + font-size: clamp(24px, 4vw, 34px); + margin: 0 0 10px; + letter-spacing: 0; + }} + h2 {{ + font-size: 17px; + margin: 0 0 14px; }} p {{ line-height: 1.5; }} + .panel {{ + border: 1px solid color-mix(in srgb, CanvasText 18%, transparent); + border-radius: 8px; + padding: 18px; + margin-top: 16px; + background: Canvas; + }} + .grid {{ + display: grid; + grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); + gap: 14px; + }} + .provider {{ + border: 1px solid color-mix(in srgb, CanvasText 14%, transparent); + border-radius: 8px; + padding: 14px; + min-height: 96px; + }} + .provider strong {{ + display: block; + margin-bottom: 4px; + }} label {{ display: block; margin-top: 16px; @@ -61,57 +90,139 @@ width: 100%; box-sizing: border-box; margin-top: 6px; - padding: 10px; + padding: 11px 12px; border-radius: 6px; border: 1px solid color-mix(in srgb, CanvasText 24%, transparent); font: inherit; }} + input[type="checkbox"] {{ + width: auto; + margin: 0 8px 0 0; + }} + .check {{ + display: flex; + align-items: center; + gap: 8px; + margin-top: 10px; + font-weight: 600; + }} + .actions {{ + display: flex; + flex-wrap: wrap; + gap: 12px; + margin-top: 18px; + }} button {{ - margin-top: 20px; padding: 10px 14px; border: 0; border-radius: 6px; - background: #1f6feb; + background: var(--accent); color: white; font: inherit; font-weight: 600; cursor: pointer; }} + button.secondary {{ + background: color-mix(in srgb, CanvasText 14%, Canvas); + color: CanvasText; + }} .status {{ margin-top: 16px; padding: 12px; border-radius: 6px; - background: color-mix(in srgb, #1f6feb 12%, Canvas); + background: color-mix(in srgb, var(--accent) 12%, Canvas); + overflow-wrap: anywhere; + }} + .meta {{ + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 10px; + margin-top: 14px; + }} + .pill {{ + border: 1px solid color-mix(in srgb, CanvasText 14%, transparent); + border-radius: 999px; + padding: 8px 12px; overflow-wrap: anywhere; }} + details {{ + margin-top: 14px; + }} + summary {{ + cursor: pointer; + font-weight: 600; + }} .muted {{ opacity: .75; font-size: 14px; }} + .small {{ + font-size: 13px; + }}
-

Codex LiteLLM session keys

-

Keys are kept only in this local process environment and passed to the LiteLLM subprocess. They are not written to disk.

+

Codex LiteLLM dispatch

+

Start a local LiteLLM proxy for Codex with cloud keys kept only in this session. Qwen local runs through Ollama and needs no cloud key.

{status} - - - - - - - - - - - - - -
- -
-

Proxy URL: http://127.0.0.1:{proxy_port}/v1

+
+

Providers

+
+
+ OpenAI + Used for default and deep coding aliases. +
+
+ Gemini + Used for low-cost, long-context, and relief routing. +
+
+ Hugging Face + Optional aliases for HF-hosted open models. +
+
+ Qwen local + {qwen_status} +
+
+
+
Proxy URL: http://127.0.0.1:{proxy_port}/v1
+
Codex model aliases: codex-light, codex-default, codex-long, codex-deep
+
Local fallback: codex-qwen-local
+
+
+ +
+

Session keys

+
+ + + + + + + +
+ Advanced custom Qwen endpoint + + + + +
+
+ +
+
+
+
+ +
+
+
@@ -145,6 +256,18 @@ def wait_for_port(host: str, port: int, timeout: float = 20.0) -> bool: return False +def local_qwen_status() -> str: + """Return a short status for the local Ollama Qwen fallback.""" + try: + with urllib.request.urlopen("http://127.0.0.1:11434/v1/models", timeout=2) as response: + body = response.read().decode("utf-8", errors="replace") + except OSError: + return "Ollama is not reachable on 127.0.0.1:11434." + if "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest" in body: + return "Ready through Ollama on 127.0.0.1:11434." + return "Ollama is running, but the Qwen2.5 Coder model was not found." + + class SessionState: """Mutable server state.""" @@ -184,7 +307,8 @@ def do_POST(self) -> None: # noqa: N802 def _send_page(self) -> None: safe_message = html.escape(self.state.message) status = f'
{safe_message}
' - body = PAGE.format(status=status, proxy_port=self.proxy_port).encode("utf-8") + qwen_status = html.escape(local_qwen_status()) + body = PAGE.format(status=status, proxy_port=self.proxy_port, qwen_status=qwen_status).encode("utf-8") self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.send_header("Cache-Control", "no-store") @@ -203,10 +327,11 @@ def _start_proxy(self) -> None: openai_key = form.get("OPENAI_API_KEY", "") gemini_key = form.get("GEMINI_API_KEY", "") hf_token = form.get("HF_TOKEN", "") + use_local_qwen = form.get("USE_LOCAL_QWEN", "") == "1" qwen_base = form.get("QWEN_API_BASE", "") qwen_key = form.get("QWEN_API_KEY", "") - if not any((openai_key, gemini_key, hf_token, qwen_base)): - self.state.message = "Provide at least one provider key." + if not any((openai_key, gemini_key, hf_token, use_local_qwen, qwen_base)): + self.state.message = "Provide at least one provider key, or keep local Qwen enabled." self._send_page() return @@ -256,7 +381,7 @@ def _start_proxy(self) -> None: providers.append("Gemini") if hf_token: providers.append("Hugging Face") - if qwen_base: + if use_local_qwen or qwen_base: providers.append("Qwen local") self.state.message = "Session proxy started with: " + ", ".join(providers) else: diff --git a/scripts/python/tests/test_codex_key_session_web.py b/scripts/python/tests/test_codex_key_session_web.py index 8842531..6105c9f 100644 --- a/scripts/python/tests/test_codex_key_session_web.py +++ b/scripts/python/tests/test_codex_key_session_web.py @@ -53,6 +53,7 @@ def test_start_proxy_exports_hf_token_under_litellm_mapping_name(self) -> None: "OPENAI_API_KEY": "", "GEMINI_API_KEY": "", "HF_TOKEN": "hf_test", + "USE_LOCAL_QWEN": "", "QWEN_API_BASE": "", "QWEN_API_KEY": "", } @@ -79,6 +80,38 @@ def fake_popen(*args: object, **kwargs: object) -> MagicMock: self.assertEqual(captured_env["HUGGINGFACE_API_KEY"], "hf_test") self.assertIn("Hugging Face", handler.state.message) + def test_start_proxy_allows_qwen_local_without_cloud_keys(self) -> None: + handler = object.__new__(WEB.KeySessionHandler) + handler.state = WEB.SessionState() + handler.config_path = Path("config.yaml") + handler.litellm_path = Path("litellm.exe") + handler.proxy_host = "127.0.0.1" + handler.proxy_port = 4000 + handler._read_form = MagicMock( # type: ignore[method-assign] + return_value={ + "OPENAI_API_KEY": "", + "GEMINI_API_KEY": "", + "HF_TOKEN": "", + "USE_LOCAL_QWEN": "1", + "QWEN_API_BASE": "", + "QWEN_API_KEY": "", + } + ) + handler._stop_proxy = MagicMock() # type: ignore[method-assign] + handler._send_page = MagicMock() # type: ignore[method-assign] + + process = MagicMock() + process.poll.return_value = None + + with ( + patch.object(WEB.subprocess, "Popen", return_value=process), + patch.object(WEB, "wait_for_port", return_value=True), + patch.dict(os.environ, {}, clear=True), + ): + handler._start_proxy() + + self.assertIn("Qwen local", handler.state.message) + if __name__ == "__main__": unittest.main() From 7ecc30dc7716d704f37cffbc5d2ab9c439a417f0 Mon Sep 17 00:00:00 2001 From: Tibo2403 Date: Sat, 27 Jun 2026 23:32:57 +0200 Subject: [PATCH 6/6] Add OpenAI quota saver routing --- scripts/python/README_Codex_Cost_Routing.md | 40 +++++++++- scripts/python/codex-routing-policy.yaml | 1 + scripts/python/codex_cost_router.py | 23 +++++- scripts/python/litellm-cost-routing.yaml | 75 +++++++++++++++++-- .../python/tests/test_codex_cost_router.py | 16 ++++ 5 files changed, 142 insertions(+), 13 deletions(-) diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md index 834444b..a3e3c62 100644 --- a/scripts/python/README_Codex_Cost_Routing.md +++ b/scripts/python/README_Codex_Cost_Routing.md @@ -10,21 +10,52 @@ applies budgets, and selects one of these LiteLLM aliases: - `codex-default` for normal coding work - `codex-long` for long-context reads, log review, and synthesis - `codex-deep` for difficult debugging, security, and architecture decisions +- `codex-no-openai` for Gemini + local Qwen routing when OpenAI quota is low + or exhausted - `codex-cheap` and `codex-strong` as backward-compatible aliases - `codex-hf-cheap` for simple Hugging Face / open-model tasks when `HF_TOKEN` is set - `codex-hf-fast` for larger Hugging Face / multi-provider tasks when `HF_TOKEN` is set -OpenAI and Gemini are both configured through LiteLLM model groups. The normal -default keeps most code-generation traffic on OpenAI while letting Gemini absorb -long-context and lower-risk work. This reduces token saturation without sending -high-stakes changes blindly to the cheapest model. +OpenAI, Gemini, and local Qwen are configured through LiteLLM model groups. The +normal default now balances OpenAI with Gemini relief and keeps Qwen as a local +zero-cost fallback. This reduces token saturation without sending high-stakes +changes blindly to the cheapest model. API keys are never committed or written to a configuration file. `OPENAI_API_KEY` is required for the default profile; `GEMINI_API_KEY` is optional but recommended to activate the OpenAI/Gemini dispatching path. +## OpenAI Quota Saver + +When OpenAI quota is low or exhausted, use the `codex-no-openai` alias. It routes +through Gemini first and local Qwen second, without OpenAI entries in the model +group: + +```powershell +codex --model codex-no-openai +``` + +For one-shot wrapper calls, either force the provider: + +```powershell +python .\scripts\python\codex_cost_router.py run --dry-run ` + --provider no-openai ` + "Refactor this Python API without using OpenAI quota" +``` + +or set a temporary session mode: + +```powershell +$env:CODEX_ROUTER_OPENAI_MODE = 'avoid' +python .\scripts\python\codex_cost_router.py run --dry-run ` + "Refactor this Python API without using OpenAI quota" +``` + +For a durable default, set `avoid_openai: true` in +`codex-routing-policy.yaml`. + ## Hugging Face Integration Hugging Face can be used in two optional places. @@ -129,6 +160,7 @@ Default policy: default_provider: auto default_codex_provider: litellm open_models_only: false +avoid_openai: false max_cost_usd: 0.0 task_provider_rules: diff --git a/scripts/python/codex-routing-policy.yaml b/scripts/python/codex-routing-policy.yaml index 81052fa..0850f90 100644 --- a/scripts/python/codex-routing-policy.yaml +++ b/scripts/python/codex-routing-policy.yaml @@ -6,6 +6,7 @@ default_provider: auto default_codex_provider: litellm open_models_only: false +avoid_openai: false max_cost_usd: 0.0 task_provider_rules: diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py index f4ce62a..916851b 100644 --- a/scripts/python/codex_cost_router.py +++ b/scripts/python/codex_cost_router.py @@ -34,9 +34,10 @@ HF_CHEAP_MODEL = "codex-hf-cheap" HF_DIRECT_MODEL = "openai/gpt-oss-120b:fastest" QWEN_LOCAL_MODEL = "codex-qwen-local" +NO_OPENAI_MODEL = "codex-no-openai" DEFAULT_MAX_INPUT_TOKENS = 12_000 DEFAULT_MAX_OUTPUT_TOKENS = 2_000 -PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen") +PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen", "no-openai") CODEX_PROVIDERS = ("litellm", "huggingface") MODELS = ( LIGHT_MODEL, @@ -48,6 +49,7 @@ HF_FAST_MODEL, HF_CHEAP_MODEL, QWEN_LOCAL_MODEL, + NO_OPENAI_MODEL, ) LITELLM_HOST = "localhost" LITELLM_PORT = 4000 @@ -59,6 +61,7 @@ "default_provider": "auto", "default_codex_provider": "litellm", "open_models_only": False, + "avoid_openai": False, "max_cost_usd": 0.0, "task_provider_rules": { "simple": "auto", @@ -80,6 +83,7 @@ HF_CHEAP_MODEL: {"input": 0.10, "output": 0.30}, HF_FAST_MODEL: {"input": 0.25, "output": 0.75}, QWEN_LOCAL_MODEL: {"input": 0.0, "output": 0.0}, + NO_OPENAI_MODEL: {"input": 0.40, "output": 1.50}, } SIMPLE_TERMS = ( @@ -411,6 +415,16 @@ def default_provider() -> str: return provider if provider in PROVIDERS else "auto" +def openai_avoidance_enabled(policy: dict[str, Any] | None = None) -> bool: + """Return whether OpenAI should be avoided to preserve or bypass quota.""" + value = os.environ.get("CODEX_ROUTER_OPENAI_MODE", "").casefold() + if value in {"avoid", "off", "depleted", "quota", "no-openai", "no_openai"}: + return True + if value in {"", "auto", "normal", "on"}: + return bool(policy and policy.get("avoid_openai")) + return False + + def default_codex_provider() -> str: """Read the Codex-facing provider preference with a safe fallback.""" provider = os.environ.get("CODEX_ROUTER_CODEX_PROVIDER", "litellm").casefold() @@ -441,6 +455,8 @@ def provider_from_policy( return default_provider(), "provider forced by CODEX_ROUTER_PROVIDER" if bool(policy.get("open_models_only")): return "huggingface", "policy open_models_only" + if openai_avoidance_enabled(policy): + return "no-openai", "OpenAI avoidance enabled" complexity, _ = classify_complexity(prompt) rules = policy.get("task_provider_rules", {}) if isinstance(rules, dict) and complexity in rules: @@ -519,6 +535,11 @@ def route_model( return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}" return DEFAULT_MODEL, "qwen requested but Ollama is not listening on 127.0.0.1:11434; using default OpenAI/Gemini tier" + if provider == "no-openai": + if qwen_available(): + return NO_OPENAI_MODEL, f"OpenAI avoided; Gemini/Qwen alias selected; {reason}" + return LONG_MODEL, f"OpenAI avoided; Qwen unavailable so Gemini long-context alias selected; {reason}" + if provider == "openai": model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL return model, f"openai provider requested; {reason}" diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml index ccda823..6330036 100644 --- a/scripts/python/litellm-cost-routing.yaml +++ b/scripts/python/litellm-cost-routing.yaml @@ -1,10 +1,11 @@ # LiteLLM OSS self-hosted proxy example for Codex. # Task-oriented aliases let Codex route by workload instead of hard-coding a # single provider: -# - codex-light: cheap/frequent work, Gemini Flash biased -# - codex-default: normal coding work, OpenAI biased with Gemini relief +# - codex-light: cheap/frequent work, Gemini Flash biased with local Qwen relief +# - codex-default: normal coding work, balanced OpenAI/Gemini with local Qwen relief # - codex-long: long-context reads and synthesis, Gemini Pro biased -# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased +# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased with fast fallback +# - codex-no-openai: Gemini/Qwen routing when OpenAI quota is exhausted # - codex-qwen-local: local Ollama Qwen fallback # API keys are read from environment variables and must never be committed. # Qwen fallback expects Ollama on http://127.0.0.1:11434/v1 with: @@ -47,19 +48,33 @@ model_list: litellm_params: model: openai/gpt-5.4-mini api_key: os.environ/OPENAI_API_KEY - weight: 3 + weight: 2 + + - model_name: codex-light + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 - model_name: codex-default litellm_params: model: openai/gpt-5.5 api_key: os.environ/OPENAI_API_KEY - weight: 8 + weight: 5 - model_name: codex-default litellm_params: model: gemini/gemini-3.5-pro api_key: os.environ/GEMINI_API_KEY - weight: 2 + weight: 4 + + - model_name: codex-default + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 - model_name: codex-long litellm_params: @@ -79,18 +94,53 @@ model_list: api_key: os.environ/OPENAI_API_KEY weight: 1 + - model_name: codex-long + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 + - model_name: codex-deep litellm_params: model: openai/gpt-5.5 api_key: os.environ/OPENAI_API_KEY - weight: 10 + weight: 7 - model_name: codex-deep litellm_params: model: gemini/gemini-3.5-pro api_key: os.environ/GEMINI_API_KEY + weight: 2 + + - model_name: codex-deep + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local weight: 1 + # OpenAI quota saver. Use this alias directly, or set + # CODEX_ROUTER_OPENAI_MODE=avoid with codex_cost_router.py. + - model_name: codex-no-openai + litellm_params: + model: gemini/gemini-3.5-pro + api_key: os.environ/GEMINI_API_KEY + weight: 6 + + - model_name: codex-no-openai + litellm_params: + model: gemini/gemini-3.5-flash + api_key: os.environ/GEMINI_API_KEY + weight: 2 + + - model_name: codex-no-openai + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 2 + # Local Ollama fallback for Qwen2.5 Coder 7B GGUF. # Prepare with: # ollama pull hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest @@ -125,24 +175,33 @@ router_settings: hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest: codex-qwen-local fallbacks: - codex-light: + - codex-no-openai - codex-default - codex-qwen-local - codex-default: + - codex-no-openai - codex-long - codex-light - codex-qwen-local - codex-long: + - codex-no-openai - codex-default - codex-qwen-local - codex-deep: + - codex-no-openai - codex-default - codex-long - codex-qwen-local + - codex-no-openai: + - codex-long + - codex-qwen-local - codex-cheap: + - codex-no-openai - codex-strong - codex-default - codex-qwen-local - codex-strong: + - codex-no-openai - codex-default - codex-long - codex-qwen-local @@ -166,7 +225,7 @@ router_settings: allowed_fails_policy: AuthenticationErrorAllowedFails: 0 TimeoutErrorAllowedFails: 2 - RateLimitErrorAllowedFails: 4 + RateLimitErrorAllowedFails: 1 litellm_settings: drop_params: true diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py index fe75d52..9eaf85c 100644 --- a/scripts/python/tests/test_codex_cost_router.py +++ b/scripts/python/tests/test_codex_cost_router.py @@ -71,6 +71,12 @@ def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None "codex-qwen-local", ) + def test_route_model_can_avoid_openai_with_gemini_qwen_alias(self) -> None: + with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}): + model, reason = ROUTER.route_model("Refactor this Python API", provider="no-openai") + self.assertEqual(model, "codex-no-openai") + self.assertIn("OpenAI avoided", reason) + def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None: with patch.dict(ROUTER.os.environ, {}, clear=True), patch.object( ROUTER.socket, "create_connection", side_effect=OSError @@ -143,6 +149,16 @@ def test_policy_open_models_only_prefers_hugging_face(self) -> None: self.assertEqual(ROUTER.provider_from_policy("Security review", None, policy)[0], "huggingface") self.assertEqual(ROUTER.codex_provider_from_policy(None, policy)[0], "huggingface") + def test_policy_or_environment_can_avoid_openai(self) -> None: + policy = {**ROUTER.DEFAULT_POLICY, "avoid_openai": True} + provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, policy) + self.assertEqual(provider, "no-openai") + self.assertIn("OpenAI avoidance", reason) + with patch.dict(ROUTER.os.environ, {"CODEX_ROUTER_OPENAI_MODE": "avoid"}): + provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, ROUTER.DEFAULT_POLICY) + self.assertEqual(provider, "no-openai") + self.assertIn("OpenAI avoidance", reason) + def test_build_optimized_prompt_respects_budget(self) -> None: context = "
" + ("Architecture production Odoo migration security. " * 1000) + "
" optimized = ROUTER.build_optimized_prompt(context, 120)