From 1d1736eb6ca1e8303c52b6acb18fc50f603ef77f Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 18:42:33 +0200
Subject: [PATCH 1/6] Add self-hosted Qwen fallback for Codex routing

---
 scripts/python/codex-routing-policy.yaml      |  2 ++
 scripts/python/codex_cost_router.py           | 30 ++++++++++++++++++-
 scripts/python/codex_key_session_web.py       | 13 +++++++-
 scripts/python/litellm-cost-routing.yaml      | 26 ++++++++++++++++
 .../python/tests/test_codex_cost_router.py    | 17 +++++++++++
 5 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/scripts/python/codex-routing-policy.yaml b/scripts/python/codex-routing-policy.yaml
index a1200f9..81052fa 100644
--- a/scripts/python/codex-routing-policy.yaml
+++ b/scripts/python/codex-routing-policy.yaml
@@ -1,5 +1,7 @@
 # Codex cost-routing policy.
 # CLI options still have priority, then environment variables, then this file.
+# Provider choices: auto, openai, gemini, huggingface, qwen.
+# qwen uses a self-hosted OpenAI-compatible endpoint via QWEN_API_BASE.
 
 default_provider: auto
 default_codex_provider: litellm
diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py
index a7b6af1..d72e9e1 100644
--- a/scripts/python/codex_cost_router.py
+++ b/scripts/python/codex_cost_router.py
@@ -33,9 +33,10 @@
 HF_FAST_MODEL = "codex-hf-fast"
 HF_CHEAP_MODEL = "codex-hf-cheap"
 HF_DIRECT_MODEL = "openai/gpt-oss-120b:fastest"
+QWEN_LOCAL_MODEL = "codex-qwen-local"
 DEFAULT_MAX_INPUT_TOKENS = 12_000
 DEFAULT_MAX_OUTPUT_TOKENS = 2_000
-PROVIDERS = ("auto", "openai", "gemini", "huggingface")
+PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen")
 CODEX_PROVIDERS = ("litellm", "huggingface")
 MODELS = (
     LIGHT_MODEL,
@@ -46,6 +47,7 @@
     LEGACY_STRONG_MODEL,
     HF_FAST_MODEL,
     HF_CHEAP_MODEL,
+    QWEN_LOCAL_MODEL,
 )
 LITELLM_HOST = "localhost"
 LITELLM_PORT = 4000
@@ -75,6 +77,7 @@
     LEGACY_STRONG_MODEL: {"input": 2.00, "output": 8.00},
     HF_CHEAP_MODEL: {"input": 0.10, "output": 0.30},
     HF_FAST_MODEL: {"input": 0.25, "output": 0.75},
+    QWEN_LOCAL_MODEL: {"input": 0.0, "output": 0.0},
 }
 
 SIMPLE_TERMS = (
@@ -122,6 +125,16 @@
     "provider benchmark",
     "benchmark providers",
 )
+QWEN_TERMS = (  # substrings looked up in the normalized prompt to pick the Qwen local alias
+    "qwen",
+    "auto-heberge",
+    "auto heberge",
+    "auto-hebergee",
+    "self-hosted",
+    "self hosted",
+    "local llm",
+    "openai-compatible local",
+)
 LONG_CONTEXT_TERMS = (
     "gros contexte",
     "long contexte",
@@ -378,6 +391,11 @@ def hf_available() -> bool:
     return bool(os.environ.get("HF_TOKEN"))
 
 
+def qwen_available() -> bool:
+    """Return whether a self-hosted Qwen endpoint is configured."""
+    return bool(os.environ.get("QWEN_API_BASE"))
+
+
 def default_provider() -> str:
     """Read the provider preference from the environment with a safe fallback."""
     provider = os.environ.get("CODEX_ROUTER_PROVIDER", "auto").casefold()
@@ -487,6 +505,11 @@ def route_model(
             return model, f"huggingface provider requested; {reason}"
         return DEFAULT_MODEL, "huggingface requested but HF_TOKEN is missing; using default OpenAI/Gemini tier"
 
+    if provider == "qwen":
+        if qwen_available():
+            return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}"
+        return DEFAULT_MODEL, "qwen requested but QWEN_API_BASE is missing; using default OpenAI/Gemini tier"
+
     if provider == "openai":
         model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL
         return model, f"openai provider requested; {reason}"
@@ -499,6 +522,9 @@ def route_model(
         model = HF_CHEAP_MODEL if complexity == "simple" else HF_FAST_MODEL
         return model, f"huggingface-related task; {reason}"
 
+    if any(term in normalized for term in QWEN_TERMS) and qwen_available():
+        return QWEN_LOCAL_MODEL, f"qwen local task; {reason}"
+
     if wants_long_context:
         return LONG_MODEL, f"long-context task; {reason}"
 
@@ -656,6 +682,8 @@ def print_doctor() -> int:
         ("LiteLLM proxy localhost:4000", proxy_available(), "listening" if proxy_available() else "not listening"),
         ("LITELLM_API_KEY", bool(os.environ.get("LITELLM_API_KEY")), "set" if os.environ.get("LITELLM_API_KEY") else "missing"),
         ("OPENAI_API_KEY", bool(os.environ.get("OPENAI_API_KEY")), "set" if os.environ.get("OPENAI_API_KEY") else "missing"),
+        ("QWEN_API_BASE optional", True, os.environ.get("QWEN_API_BASE") or "missing; self-hosted Qwen fallback disabled"),
+        ("QWEN_API_KEY optional", True, "set" if os.environ.get("QWEN_API_KEY") else "missing; use dummy value for no-auth local servers"),
         ("HF_TOKEN optional", True, "set" if hf_available() else "missing; Hugging Face aliases disabled"),
         ("PYTHONUTF8", os.environ.get("PYTHONUTF8") == "1", "1" if os.environ.get("PYTHONUTF8") == "1" else "missing or not 1"),
         ("Cost-routing profile", router_enabled(), "enabled" if router_enabled() else "disabled"),
diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py
index f957582..46aacbe 100644
--- a/scripts/python/codex_key_session_web.py
+++ b/scripts/python/codex_key_session_web.py
@@ -102,6 +102,10 @@
       <input id="gemini" name="GEMINI_API_KEY" type="password" placeholder="AI..." autocomplete="off">
       <label for="hf">HF_TOKEN optional</label>
       <input id="hf" name="HF_TOKEN" type="password" placeholder="hf_..." autocomplete="off">
+      <label for="qwen_base">QWEN_API_BASE optional</label>
+      <input id="qwen_base" name="QWEN_API_BASE" type="url" placeholder="http://127.0.0.1:8000/v1" autocomplete="off">
+      <label for="qwen_key">QWEN_API_KEY optional</label>
+      <input id="qwen_key" name="QWEN_API_KEY" type="password" placeholder="sk-local-qwen" autocomplete="off">
       <button type="submit">Start session proxy</button>
     </form>
     <form method="post" action="/stop">
@@ -199,7 +203,9 @@ def _start_proxy(self) -> None:
         openai_key = form.get("OPENAI_API_KEY", "")
         gemini_key = form.get("GEMINI_API_KEY", "")
         hf_token = form.get("HF_TOKEN", "")
-        if not any((openai_key, gemini_key, hf_token)):
+        qwen_base = form.get("QWEN_API_BASE", "")
+        qwen_key = form.get("QWEN_API_KEY", "")
+        if not any((openai_key, gemini_key, hf_token, qwen_base)):
             self.state.message = "Provide at least one provider key."
             self._send_page()
             return
@@ -215,6 +221,9 @@ def _start_proxy(self) -> None:
             env["GEMINI_API_KEY"] = gemini_key
         if hf_token:
             env["HF_TOKEN"] = hf_token
+        if qwen_base:
+            env["QWEN_API_BASE"] = qwen_base.rstrip("/")
+            env["QWEN_API_KEY"] = qwen_key or "sk-local-qwen"
 
         try:
             self.state.process = subprocess.Popen(
@@ -246,6 +255,8 @@ def _start_proxy(self) -> None:
                 providers.append("Gemini")
             if hf_token:
                 providers.append("Hugging Face")
+            if qwen_base:
+                providers.append("Qwen local")
             self.state.message = "Session proxy started with: " + ", ".join(providers)
         else:
             self.state.message = "LiteLLM process started, but the proxy port did not become ready yet."
diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml
index d7027c4..5a85c7a 100644
--- a/scripts/python/litellm-cost-routing.yaml
+++ b/scripts/python/litellm-cost-routing.yaml
@@ -5,7 +5,11 @@
 # - codex-default: normal coding work, OpenAI biased with Gemini relief
 # - codex-long: long-context reads and synthesis, Gemini Pro biased
 # - codex-deep: high-stakes debugging/security/architecture, OpenAI biased
+# - codex-qwen-local: self-hosted OpenAI-compatible Qwen fallback
 # API keys are read from environment variables and must never be committed.
+# Qwen fallback expects QWEN_API_BASE, for example http://127.0.0.1:8000/v1.
+# QWEN_API_KEY is optional for many local servers; use any dummy value when the
+# server does not require authentication.
 
 model_list:
   # Backward-compatible aliases used by older wrapper calls.
@@ -88,6 +92,17 @@ model_list:
       api_key: os.environ/GEMINI_API_KEY
       weight: 1
 
+  # Local OpenAI-compatible fallback for a self-hosted Qwen endpoint.
+  # Start a server with a /v1-compatible API and set:
+  #   QWEN_API_BASE=http://127.0.0.1:8000/v1
+  #   QWEN_API_KEY=sk-local-qwen
+  - model_name: codex-qwen-local
+    litellm_params:
+      model: openai/qwen-auto-hosted
+      api_base: os.environ/QWEN_API_BASE
+      api_key: os.environ/QWEN_API_KEY
+      weight: 1
+
   - model_name: codex-hf-cheap
     litellm_params:
       model: huggingface/groq/openai/gpt-oss-120b
@@ -109,29 +124,40 @@ router_settings:
     gpt-5.4-mini: codex-light
     gemini-3.5-pro: codex-long
     gemini-3.5-flash: codex-light
+    qwen-auto-hosted: codex-qwen-local
   fallbacks:
     - codex-light:
         - codex-default
+        - codex-qwen-local
     - codex-default:
         - codex-long
         - codex-light
+        - codex-qwen-local
     - codex-long:
         - codex-default
+        - codex-qwen-local
     - codex-deep:
         - codex-default
         - codex-long
+        - codex-qwen-local
     - codex-cheap:
         - codex-strong
         - codex-default
+        - codex-qwen-local
     - codex-strong:
         - codex-default
         - codex-long
+        - codex-qwen-local
     - codex-hf-cheap:
         - codex-light
         - codex-cheap
+        - codex-qwen-local
     - codex-hf-fast:
         - codex-default
         - codex-deep
+        - codex-qwen-local
+    - codex-qwen-local:
+        - codex-light
   context_window_fallbacks:
     - codex-light:
         - codex-long
diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py
index cf270bb..de14cca 100644
--- a/scripts/python/tests/test_codex_cost_router.py
+++ b/scripts/python/tests/test_codex_cost_router.py
@@ -60,6 +60,23 @@ def test_route_model_falls_back_when_hugging_face_token_is_missing(self) -> None
             self.assertEqual(model, "codex-default")
             self.assertIn("HF_TOKEN is missing", reason)
 
+    def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None:
+        with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:8000/v1"}):
+            self.assertEqual(
+                ROUTER.route_model("Use qwen auto heberge as backup", provider="qwen")[0],
+                "codex-qwen-local",
+            )
+            self.assertEqual(
+                ROUTER.route_model("Prefer self-hosted local llm fallback")[0],
+                "codex-qwen-local",
+            )
+
+    def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None:
+        with patch.dict(ROUTER.os.environ, {}, clear=True):
+            model, reason = ROUTER.route_model("Use Qwen local", provider="qwen")
+            self.assertEqual(model, "codex-default")
+            self.assertIn("QWEN_API_BASE is missing", reason)
+
     def test_codex_provider_helpers_select_expected_profiles(self) -> None:
         self.assertEqual(ROUTER.codex_profile("litellm"), "cost-routing")
         self.assertEqual(ROUTER.codex_profile("huggingface"), "cost-routing-hf")

From cddd21f8166f13c7129e2e030d712ed9febbc5cf Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 19:04:58 +0200
Subject: [PATCH 2/6] Fix Hugging Face Codex dispatch routing

---
 scripts/python/codex_key_session_web.py       |  1 +
 scripts/python/litellm-cost-routing.yaml      |  2 +-
 .../tests/test_codex_key_session_web.py       | 41 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py
index 46aacbe..629f564 100644
--- a/scripts/python/codex_key_session_web.py
+++ b/scripts/python/codex_key_session_web.py
@@ -221,6 +221,7 @@ def _start_proxy(self) -> None:
             env["GEMINI_API_KEY"] = gemini_key
         if hf_token:
             env["HF_TOKEN"] = hf_token
+            env["HUGGINGFACE_API_KEY"] = hf_token
         if qwen_base:
             env["QWEN_API_BASE"] = qwen_base.rstrip("/")
             env["QWEN_API_KEY"] = qwen_key or "sk-local-qwen"
diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml
index 5a85c7a..7bb2db6 100644
--- a/scripts/python/litellm-cost-routing.yaml
+++ b/scripts/python/litellm-cost-routing.yaml
@@ -110,7 +110,7 @@ model_list:
 
   - model_name: codex-hf-fast
     litellm_params:
-      model: huggingface/together/deepseek-ai/DeepSeek-R1
+      model: huggingface/together/openai/gpt-oss-120b
       api_key: os.environ/HF_TOKEN
 
 router_settings:
diff --git a/scripts/python/tests/test_codex_key_session_web.py b/scripts/python/tests/test_codex_key_session_web.py
index afce691..8842531 100644
--- a/scripts/python/tests/test_codex_key_session_web.py
+++ b/scripts/python/tests/test_codex_key_session_web.py
@@ -1,9 +1,10 @@
 """Tests for the optional local web key session launcher."""
 
 import importlib.util
+import os
 import unittest
 from pathlib import Path
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 
 MODULE_PATH = Path(__file__).resolve().parents[1] / "codex_key_session_web.py"
@@ -40,6 +41,44 @@ def test_stop_proxy_terminates_running_process(self) -> None:
         self.assertIsNone(state.process)
         self.assertEqual(state.message, "done")
 
+    def test_start_proxy_exports_hf_token_under_litellm_mapping_name(self) -> None:
+        handler = object.__new__(WEB.KeySessionHandler)  # skip __init__; only the attributes set below are provided
+        handler.state = WEB.SessionState()
+        handler.config_path = Path("config.yaml")
+        handler.litellm_path = Path("litellm.exe")
+        handler.proxy_host = "127.0.0.1"
+        handler.proxy_port = 4000
+        handler._read_form = MagicMock(  # type: ignore[method-assign]
+            return_value={
+                "OPENAI_API_KEY": "",
+                "GEMINI_API_KEY": "",
+                "HF_TOKEN": "hf_test",  # the only provider credential submitted
+                "QWEN_API_BASE": "",
+                "QWEN_API_KEY": "",
+            }
+        )
+        handler._stop_proxy = MagicMock()  # type: ignore[method-assign]
+        handler._send_page = MagicMock()  # type: ignore[method-assign]
+
+        process = MagicMock()
+        process.poll.return_value = None  # poll() returning None means the fake process is still running
+        captured_env: dict[str, str] = {}
+
+        def fake_popen(*args: object, **kwargs: object) -> MagicMock:
+            captured_env.update(kwargs["env"])  # type: ignore[index]
+            return process
+
+        with (
+            patch.object(WEB.subprocess, "Popen", side_effect=fake_popen),
+            patch.object(WEB, "wait_for_port", return_value=True),
+            patch.dict(os.environ, {}, clear=True),  # keep ambient provider keys out of the captured env
+        ):
+            handler._start_proxy()
+
+        self.assertEqual(captured_env["HF_TOKEN"], "hf_test")
+        self.assertEqual(captured_env["HUGGINGFACE_API_KEY"], "hf_test")  # same token under the second name
+        self.assertIn("Hugging Face", handler.state.message)
+
 
 if __name__ == "__main__":
     unittest.main()

From 6a634cee4a0bb0ca017a43146d8f602668f894a3 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 20:36:13 +0200
Subject: [PATCH 3/6] Document and verify Codex LiteLLM dispatch

---
 .../Install-CodexLocalLiteLLMAssets.ps1       |  3 +-
 scripts/python/README.md                      | 11 ++-
 scripts/python/README_Codex_Cost_Routing.md   | 52 +++++++++-
 scripts/python/Test-CodexLiteLLMDispatch.ps1  | 95 +++++++++++++++++++
 4 files changed, 151 insertions(+), 10 deletions(-)
 create mode 100644 scripts/python/Test-CodexLiteLLMDispatch.ps1

diff --git a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
index bc87468..6fc0938 100644
--- a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
+++ b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
@@ -8,7 +8,8 @@ New-Item -ItemType Directory -Force -Path $target | Out-Null
 $files = @(
     'litellm-cost-routing.yaml',
     'codex_key_session_web.py',
-    'Start-CodexKeySessionWeb.ps1'
+    'Start-CodexKeySessionWeb.ps1',
+    'Test-CodexLiteLLMDispatch.ps1'
 )
 
 foreach ($file in $files) {
diff --git a/scripts/python/README.md b/scripts/python/README.md
index 02ebe3e..b1b9d59 100644
--- a/scripts/python/README.md
+++ b/scripts/python/README.md
@@ -58,9 +58,10 @@ dispatches those aliases across OpenAI and Gemini while keeping API keys in
 environment variables. When `HF_TOKEN` is available, it can also route Hugging
 Face and multi-provider tasks through the `codex-hf-cheap` and `codex-hf-fast`
 LiteLLM aliases, or launch an optional `cost-routing-hf` Codex profile that
-points directly at the Hugging Face router. `codex-routing-policy.yaml` keeps
-the default provider rules and fallback order editable without changing Python
-code.
+points directly at the Hugging Face router. `codex-qwen-local` is available as
+a self-hosted OpenAI-compatible Qwen fallback when `QWEN_API_BASE` is set.
+`codex-routing-policy.yaml` keeps the default provider rules and fallback order
+editable without changing Python code.
 
 See [`README_Codex_Cost_Routing.md`](README_Codex_Cost_Routing.md) for setup,
 activation, LiteLLM configuration, and usage instructions.
@@ -68,7 +69,9 @@ activation, LiteLLM configuration, and usage instructions.
 To enter OpenAI, Gemini, or Hugging Face keys through a local page for one
 session, run `Start-CodexKeySessionWeb.ps1` and open
 `http://127.0.0.1:8787/`. Keys are kept in memory for the LiteLLM subprocess
-and are not written to disk.
+and are not written to disk. Use `Test-CodexLiteLLMDispatch.ps1` to verify the
+local proxy aliases, or add `-Call -Model codex-hf-cheap` after entering a
+provider key to make one minimal dispatch request.
 
 ## LLM Review Tools
 
diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md
index 2eb5ed9..6ec17a7 100644
--- a/scripts/python/README_Codex_Cost_Routing.md
+++ b/scripts/python/README_Codex_Cost_Routing.md
@@ -34,7 +34,7 @@ provider pool. The local config still includes two optional aliases:
 
 ```yaml
 codex-hf-cheap -> huggingface/groq/openai/gpt-oss-120b
-codex-hf-fast  -> huggingface/together/deepseek-ai/DeepSeek-R1
+codex-hf-fast  -> huggingface/together/openai/gpt-oss-120b
 ```
 
 Set `HF_TOKEN` in the shell before starting the router. A fine-grained token
@@ -56,6 +56,30 @@ python .\scripts\python\codex_cost_router.py run --dry-run `
 `--provider auto` routes Hugging Face or multi-provider prompts to the HF aliases
 only when `HF_TOKEN` is present. Otherwise it keeps the OpenAI-backed aliases.
 
+LiteLLM also uses `HUGGINGFACE_API_KEY` while resolving some Inference Provider
+mappings. The local web session exports the submitted `HF_TOKEN` under both
+names for the LiteLLM subprocess. If you start LiteLLM manually, set both names
+to the same token:
+
+```powershell
+$env:HF_TOKEN = 'hf_...'
+$env:HUGGINGFACE_API_KEY = $env:HF_TOKEN
+```
+
+## Self-Hosted Qwen Fallback
+
+The local LiteLLM config includes `codex-qwen-local` as a final fallback for
+the main Codex aliases. It expects an OpenAI-compatible local endpoint:
+
+```powershell
+$env:QWEN_API_BASE = 'http://127.0.0.1:8000/v1'
+$env:QWEN_API_KEY = 'sk-local-qwen'
+```
+
+`QWEN_API_KEY` can be any dummy value when your local server does not require
+authentication. The local web key page also accepts these two fields and passes
+them only to the LiteLLM subprocess environment.
+
 Second, Hugging Face can be added as an optional Codex-facing layer. Running
 `enable` now installs two managed profiles:
 
@@ -155,10 +179,10 @@ If you prefer entering keys in a local page for one work session, start:
 ```
 
 Then open `http://127.0.0.1:8787/`, paste `OPENAI_API_KEY`,
-`GEMINI_API_KEY`, or `HF_TOKEN`, and submit the form. The page starts the
-LiteLLM proxy on `http://127.0.0.1:4000/v1` with those keys only in the proxy
-process environment. The keys are not written to disk and the web server
-suppresses request logging.
+`GEMINI_API_KEY`, `HF_TOKEN`, or the optional Qwen endpoint fields, and submit
+the form. The page starts the LiteLLM proxy on `http://127.0.0.1:4000/v1` with
+those values only in the proxy process environment. The keys are not written to
+disk and the web server suppresses request logging.
 
 To launch the optional Hugging Face-facing profile instead of the local LiteLLM
 proxy:
@@ -183,6 +207,23 @@ python .\scripts\python\codex_cost_router.py doctor
 If a browser opened on `http://localhost:4000/health` shows `Unauthorized`,
 that is expected: the local proxy is protected by `LITELLM_API_KEY`.
 
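+Validate the local proxy aliases without sending a chat completion (the script still queries LiteLLM's `/health` endpoint, which may itself probe every configured model):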
+
+```powershell
+.\scripts\python\Test-CodexLiteLLMDispatch.ps1
+```
+
+Run a real minimal provider call after entering the relevant key in the local
+web page:
+
+```powershell
+.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-hf-cheap -Call
+.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-qwen-local -Call
+.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-default -Call
+```
+
+The test prints a compact JSON result and never prints provider tokens.
+
 ## Optimized One-Shot Requests
 
 Use the Python wrapper when prompt cleanup and dynamic model routing are needed:
@@ -225,6 +266,7 @@ Prompts and API keys are not logged.
 - `codex_cost_router.py`: prompt optimization and one-shot routing.
 - `codex_key_session_web.py`: local-only web form for session keys.
 - `Start-CodexKeySessionWeb.ps1`: PowerShell launcher for the local key page.
+- `Test-CodexLiteLLMDispatch.ps1`: local proxy alias and optional call test.
 - `codex-routing-policy.yaml`: editable routing policy and fallback order.
 - `litellm-cost-routing.yaml`: local LiteLLM OSS OpenAI/Gemini model groups,
   context-window fallbacks, cooldowns, and compatibility aliases.
diff --git a/scripts/python/Test-CodexLiteLLMDispatch.ps1 b/scripts/python/Test-CodexLiteLLMDispatch.ps1
new file mode 100644
index 0000000..0c99189
--- /dev/null
+++ b/scripts/python/Test-CodexLiteLLMDispatch.ps1
@@ -0,0 +1,95 @@
+[CmdletBinding()]  # Smoke test for the local LiteLLM proxy: list aliases, query /health, optionally send one chat call.
+param(
+    [string]$BaseUrl = "http://127.0.0.1:4000/v1",  # OpenAI-compatible root of the proxy
+    [string]$ApiKey = "sk-local-codex",  # bearer token sent to the proxy, not a provider key
+    [string]$Model = "codex-default",  # alias used only when -Call is set
+    [switch]$Call,  # also send one minimal chat completion to $Model
+    [int]$TimeoutSec = 90  # applies to the /health and chat requests; the alias listing uses a fixed 10 s
+)
+
+$ErrorActionPreference = "Stop"
+$BaseUrl = $BaseUrl.TrimEnd("/")  # a trailing slash would yield "//models" and defeat the /v1 strip used for /health
+$headers = @{
+    "Authorization" = "Bearer $ApiKey"
+    "Content-Type" = "application/json"
+}
+
+function ConvertTo-ShortError {
+    param([object]$ErrorRecord)
+
+    $message = $ErrorRecord.Exception.Message
+    if ($ErrorRecord.ErrorDetails -and $ErrorRecord.ErrorDetails.Message) {
+        $message = $ErrorRecord.ErrorDetails.Message  # prefer the response body text when the error record carries one
+    }
+    if ($message.Length -gt 900) {
+        return $message.Substring(0, 900) + "..."  # cap the length so the final JSON stays readable
+    }
+    return $message
+}
+
+$models = Invoke-RestMethod -Uri "$BaseUrl/models" -Headers $headers -Method Get -TimeoutSec 10  # not wrapped in try: an unreachable proxy aborts the script
+$modelIds = @($models.data | ForEach-Object { $_.id })
+$requiredAliases = @(
+    "codex-light",
+    "codex-default",
+    "codex-long",
+    "codex-deep",
+    "codex-qwen-local",
+    "codex-hf-cheap",
+    "codex-hf-fast"
+)
+$missingAliases = @($requiredAliases | Where-Object { $modelIds -notcontains $_ })
+
+$health = $null  # NOTE(review): LiteLLM documents /health as calling every configured model - confirm that is acceptable when -Call is absent
+try {
+    $healthUrl = $BaseUrl -replace "/v1$", ""
+    $health = Invoke-RestMethod -Uri "$healthUrl/health" -Headers $headers -Method Get -TimeoutSec $TimeoutSec
+}
+catch {
+    $health = [pscustomobject]@{  # health failures are reported in the output, not fatal
+        healthy_count = $null
+        unhealthy_count = $null
+        health_error = ConvertTo-ShortError $_
+    }
+}
+
+$callResult = $null
+if ($Call) {
+    $body = @{
+        model = $Model
+        messages = @(
+            @{
+                role = "user"
+                content = "Reply with exactly: dispatch ok"
+            }
+        )
+        max_tokens = 16
+        temperature = 0
+    } | ConvertTo-Json -Depth 6
+
+    try {
+        $response = Invoke-RestMethod -Uri "$BaseUrl/chat/completions" -Headers $headers -Method Post -Body $body -TimeoutSec $TimeoutSec
+        $callResult = [pscustomobject]@{
+            ok = $true
+            model = $response.model
+            content = $response.choices[0].message.content
+        }
+    }
+    catch {
+        $callResult = [pscustomobject]@{
+            ok = $false
+            error = ConvertTo-ShortError $_
+        }
+    }
+}
+
+[pscustomobject]@{
+    ok = ($missingAliases.Count -eq 0 -and (-not $Call -or ($callResult -and $callResult.ok)))  # health result does not affect ok
+    base_url = $BaseUrl
+    aliases_present = @($requiredAliases | Where-Object { $modelIds -contains $_ })
+    aliases_missing = $missingAliases
+    healthy_count = $health.healthy_count
+    unhealthy_count = $health.unhealthy_count
+    health_error = $health.health_error
+    call = $callResult
+} | ConvertTo-Json -Depth 6

From d196b93504524123da53b0c5fe11a7b600497d36 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 20:57:04 +0200
Subject: [PATCH 4/6] Use Qwen2.5 Coder GGUF through Ollama

---
 .../Install-CodexLocalLiteLLMAssets.ps1       |  1 +
 scripts/python/README.md                      |  2 +-
 scripts/python/README_Codex_Cost_Routing.md   | 29 ++++++----
 scripts/python/Start-CodexQwenOllama.ps1      | 55 +++++++++++++++++++
 scripts/python/codex_cost_router.py           | 18 ++++--
 scripts/python/litellm-cost-routing.yaml      | 22 ++++----
 .../python/tests/test_codex_cost_router.py    |  8 ++-
 7 files changed, 103 insertions(+), 32 deletions(-)
 create mode 100644 scripts/python/Start-CodexQwenOllama.ps1

diff --git a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
index 6fc0938..42f54f8 100644
--- a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
+++ b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1
@@ -9,6 +9,7 @@ $files = @(
     'litellm-cost-routing.yaml',
     'codex_key_session_web.py',
     'Start-CodexKeySessionWeb.ps1',
+    'Start-CodexQwenOllama.ps1',
     'Test-CodexLiteLLMDispatch.ps1'
 )
 
diff --git a/scripts/python/README.md b/scripts/python/README.md
index b1b9d59..a0ed839 100644
--- a/scripts/python/README.md
+++ b/scripts/python/README.md
@@ -59,7 +59,7 @@ environment variables. When `HF_TOKEN` is available, it can also route Hugging
 Face and multi-provider tasks through the `codex-hf-cheap` and `codex-hf-fast`
 LiteLLM aliases, or launch an optional `cost-routing-hf` Codex profile that
 points directly at the Hugging Face router. `codex-qwen-local` is available as
-a self-hosted OpenAI-compatible Qwen fallback when `QWEN_API_BASE` is set.
+a local Ollama fallback through `Qwen/Qwen2.5-Coder-7B-Instruct-GGUF`.
 `codex-routing-policy.yaml` keeps the default provider rules and fallback order
 editable without changing Python code.
 
diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md
index 6ec17a7..834444b 100644
--- a/scripts/python/README_Codex_Cost_Routing.md
+++ b/scripts/python/README_Codex_Cost_Routing.md
@@ -66,19 +66,24 @@ $env:HF_TOKEN = 'hf_...'
 $env:HUGGINGFACE_API_KEY = $env:HF_TOKEN
 ```
 
-## Self-Hosted Qwen Fallback
+## Local Ollama Qwen Fallback
 
 The local LiteLLM config includes `codex-qwen-local` as a final fallback for
-the main Codex aliases. It expects an OpenAI-compatible local endpoint:
+the main Codex aliases. It uses Ollama's OpenAI-compatible endpoint with the
+lighter Qwen2.5 Coder 7B GGUF model:
 
 ```powershell
-$env:QWEN_API_BASE = 'http://127.0.0.1:8000/v1'
-$env:QWEN_API_KEY = 'sk-local-qwen'
+.\scripts\python\Start-CodexQwenOllama.ps1
 ```
 
-`QWEN_API_KEY` can be any dummy value when your local server does not require
-authentication. The local web key page also accepts these two fields and passes
-them only to the LiteLLM subprocess environment.
+The script starts Ollama if needed and pulls:
+
+```text
+hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+```
+
+LiteLLM then reaches it through `http://127.0.0.1:11434/v1`. No provider API key
+is required for this local fallback.
 
 Second, Hugging Face can be added as an optional Codex-facing layer. Running
 `enable` now installs two managed profiles:
@@ -179,10 +184,12 @@ If you prefer entering keys in a local page for one work session, start:
 ```
 
 Then open `http://127.0.0.1:8787/`, paste `OPENAI_API_KEY`,
-`GEMINI_API_KEY`, `HF_TOKEN`, or the optional Qwen endpoint fields, and submit
-the form. The page starts the LiteLLM proxy on `http://127.0.0.1:4000/v1` with
-those values only in the proxy process environment. The keys are not written to
-disk and the web server suppresses request logging.
+`GEMINI_API_KEY`, `HF_TOKEN`, or optional custom Qwen endpoint fields, and
+submit the form. For the default local Qwen/Ollama fallback, run
+`Start-CodexQwenOllama.ps1`; no Qwen API key is needed. The page starts the
+LiteLLM proxy on `http://127.0.0.1:4000/v1` with submitted values only in the
+proxy process environment. The keys are not written to disk and the web server
+suppresses request logging.
 
 To launch the optional Hugging Face-facing profile instead of the local LiteLLM
 proxy:
diff --git a/scripts/python/Start-CodexQwenOllama.ps1 b/scripts/python/Start-CodexQwenOllama.ps1
new file mode 100644
index 0000000..2372566
--- /dev/null
+++ b/scripts/python/Start-CodexQwenOllama.ps1
@@ -0,0 +1,55 @@
+[CmdletBinding()]  # Ensure Ollama answers on 127.0.0.1:11434 and the requested model is installed, then print the API base.
+param(
+    [string]$Model = "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest",
+    [switch]$SkipPull  # fail instead of downloading when the model is absent
+)
+
+$ErrorActionPreference = "Stop"
+$installedName = if ($Model -match ':[^/]+$') { $Model } else { "${Model}:latest" }  # Ollama lists untagged models as name:latest
+$ollama = Get-Command ollama -ErrorAction SilentlyContinue
+if (-not $ollama) {
+    throw "Ollama is not installed or not available in PATH."
+}
+
+try {
+    Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 3 | Out-Null  # probe: is a server already answering?
+}
+catch {
+    Start-Process -WindowStyle Hidden -FilePath $ollama.Source -ArgumentList @("serve")
+    $ready = $false
+    for ($i = 0; $i -lt 40; $i++) {  # up to 40 polls, 500 ms apart
+        Start-Sleep -Milliseconds 500
+        try {
+            Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 2 | Out-Null
+            $ready = $true
+            break
+        }
+        catch {
+            $ready = $false
+        }
+    }
+    if (-not $ready) {
+        throw "Ollama did not become ready on http://127.0.0.1:11434."
+    }
+}
+
+$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10
+$modelNames = @($tags.models | ForEach-Object { $_.name })
+if (($modelNames -notcontains $installedName) -and (-not $SkipPull)) {
+    & $ollama.Source pull $Model
+    if ($LASTEXITCODE -ne 0) {
+        throw "ollama pull failed for $Model"
+    }
+}
+
+$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10  # re-read so the check below sees a fresh pull
+$modelNames = @($tags.models | ForEach-Object { $_.name })
+if ($modelNames -notcontains $installedName) {
+    throw "$Model is not installed. Run without -SkipPull to download it."
+}
+
+[pscustomobject]@{
+    ok = $true
+    model = $Model
+    api_base = "http://127.0.0.1:11434/v1"
+} | ConvertTo-Json
diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py
index d72e9e1..f4ce62a 100644
--- a/scripts/python/codex_cost_router.py
+++ b/scripts/python/codex_cost_router.py
@@ -51,6 +51,8 @@
 )
 LITELLM_HOST = "localhost"
 LITELLM_PORT = 4000
+OLLAMA_HOST = "127.0.0.1"
+OLLAMA_PORT = 11434
 WINDOWS_LITELLM_FALLBACK = Path(r"C:\tmp\litellm-oss\Scripts\litellm.exe")
 POLICY_FILE = Path(__file__).with_name("codex-routing-policy.yaml")
 DEFAULT_POLICY = {
@@ -392,8 +394,15 @@ def hf_available() -> bool:
 
 
 def qwen_available() -> bool:
-    """Return whether a self-hosted Qwen endpoint is configured."""
-    return bool(os.environ.get("QWEN_API_BASE"))
+    """Return True when QWEN_API_BASE is set or a TCP connect to the local Ollama port succeeds."""
+    configured_base = os.environ.get("QWEN_API_BASE")
+    if configured_base:  # an explicitly configured endpoint is trusted without probing it
+        return True
+    try:
+        with socket.create_connection((OLLAMA_HOST, OLLAMA_PORT), timeout=1):  # TCP probe only; nothing is sent
+            return True
+    except OSError:  # connection refused, timed out, or otherwise unreachable
+        return False
 
 
 def default_provider() -> str:
@@ -508,7 +517,7 @@ def route_model(
     if provider == "qwen":
         if qwen_available():
             return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}"
-        return DEFAULT_MODEL, "qwen requested but QWEN_API_BASE is missing; using default OpenAI/Gemini tier"
+        return DEFAULT_MODEL, "qwen requested but Ollama is not listening on 127.0.0.1:11434; using default OpenAI/Gemini tier"
 
     if provider == "openai":
         model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL
@@ -682,8 +691,7 @@ def print_doctor() -> int:
         ("LiteLLM proxy localhost:4000", proxy_available(), "listening" if proxy_available() else "not listening"),
         ("LITELLM_API_KEY", bool(os.environ.get("LITELLM_API_KEY")), "set" if os.environ.get("LITELLM_API_KEY") else "missing"),
         ("OPENAI_API_KEY", bool(os.environ.get("OPENAI_API_KEY")), "set" if os.environ.get("OPENAI_API_KEY") else "missing"),
-        ("QWEN_API_BASE optional", True, os.environ.get("QWEN_API_BASE") or "missing; self-hosted Qwen fallback disabled"),
-        ("QWEN_API_KEY optional", True, "set" if os.environ.get("QWEN_API_KEY") else "missing; use dummy value for no-auth local servers"),
+        ("Ollama Qwen optional", True, "listening on 127.0.0.1:11434" if qwen_available() else "missing; run Start-CodexQwenOllama.ps1"),
         ("HF_TOKEN optional", True, "set" if hf_available() else "missing; Hugging Face aliases disabled"),
         ("PYTHONUTF8", os.environ.get("PYTHONUTF8") == "1", "1" if os.environ.get("PYTHONUTF8") == "1" else "missing or not 1"),
         ("Cost-routing profile", router_enabled(), "enabled" if router_enabled() else "disabled"),
diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml
index 7bb2db6..ccda823 100644
--- a/scripts/python/litellm-cost-routing.yaml
+++ b/scripts/python/litellm-cost-routing.yaml
@@ -5,11 +5,10 @@
 # - codex-default: normal coding work, OpenAI biased with Gemini relief
 # - codex-long: long-context reads and synthesis, Gemini Pro biased
 # - codex-deep: high-stakes debugging/security/architecture, OpenAI biased
-# - codex-qwen-local: self-hosted OpenAI-compatible Qwen fallback
+# - codex-qwen-local: local Ollama Qwen fallback
 # API keys are read from environment variables and must never be committed.
-# Qwen fallback expects QWEN_API_BASE, for example http://127.0.0.1:8000/v1.
-# QWEN_API_KEY is optional for many local servers; use any dummy value when the
-# server does not require authentication.
+# Qwen fallback expects Ollama on http://127.0.0.1:11434/v1 with:
+#   hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
 
 model_list:
   # Backward-compatible aliases used by older wrapper calls.
@@ -92,15 +91,14 @@ model_list:
       api_key: os.environ/GEMINI_API_KEY
       weight: 1
 
-  # Local OpenAI-compatible fallback for a self-hosted Qwen endpoint.
-  # Start a server with a /v1-compatible API and set:
-  #   QWEN_API_BASE=http://127.0.0.1:8000/v1
-  #   QWEN_API_KEY=sk-local-qwen
+  # Local Ollama fallback for Qwen2.5 Coder 7B GGUF.
+  # Prepare with:
+  #   ollama pull hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
   - model_name: codex-qwen-local
     litellm_params:
-      model: openai/qwen-auto-hosted
-      api_base: os.environ/QWEN_API_BASE
-      api_key: os.environ/QWEN_API_KEY
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
       weight: 1
 
   - model_name: codex-hf-cheap
@@ -124,7 +122,7 @@ router_settings:
     gpt-5.4-mini: codex-light
     gemini-3.5-pro: codex-long
     gemini-3.5-flash: codex-light
-    qwen-auto-hosted: codex-qwen-local
+    hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest: codex-qwen-local
   fallbacks:
     - codex-light:
         - codex-default
diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py
index de14cca..fe75d52 100644
--- a/scripts/python/tests/test_codex_cost_router.py
+++ b/scripts/python/tests/test_codex_cost_router.py
@@ -61,7 +61,7 @@ def test_route_model_falls_back_when_hugging_face_token_is_missing(self) -> None
             self.assertIn("HF_TOKEN is missing", reason)
 
     def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None:
-        with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:8000/v1"}):
+        with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}):
             self.assertEqual(
                 ROUTER.route_model("Use qwen auto heberge as backup", provider="qwen")[0],
                 "codex-qwen-local",
@@ -72,10 +72,12 @@ def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None
             )
 
     def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None:
-        with patch.dict(ROUTER.os.environ, {}, clear=True):
+        with patch.dict(ROUTER.os.environ, {}, clear=True), patch.object(
+            ROUTER.socket, "create_connection", side_effect=OSError
+        ):
             model, reason = ROUTER.route_model("Use Qwen local", provider="qwen")
             self.assertEqual(model, "codex-default")
-            self.assertIn("QWEN_API_BASE is missing", reason)
+            self.assertIn("Ollama is not listening", reason)
 
     def test_codex_provider_helpers_select_expected_profiles(self) -> None:
         self.assertEqual(ROUTER.codex_profile("litellm"), "cost-routing")

From 67615854ac11d51f49af62e948331d3f9ac06b2d Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 22:40:25 +0200
Subject: [PATCH 5/6] Simplify Codex LiteLLM session page

---
 scripts/python/codex_key_session_web.py       | 195 ++++++++++++++----
 .../tests/test_codex_key_session_web.py       |  33 +++
 2 files changed, 193 insertions(+), 35 deletions(-)

diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py
index 629f564..35b89b1 100644
--- a/scripts/python/codex_key_session_web.py
+++ b/scripts/python/codex_key_session_web.py
@@ -10,6 +10,7 @@
 import sys
 import time
 import urllib.parse
+import urllib.request
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
 from typing import ClassVar
@@ -30,28 +31,56 @@
     :root {{
       color-scheme: light dark;
       font-family: Segoe UI, system-ui, sans-serif;
+      --accent: #1f6feb;
+      --ok: #238636;
+      --warn: #9a6700;
     }}
     body {{
       margin: 0;
       min-height: 100vh;
-      display: grid;
-      place-items: center;
+      background: color-mix(in srgb, Canvas 94%, CanvasText);
       background: Canvas;
       color: CanvasText;
     }}
     main {{
-      width: min(680px, calc(100vw - 32px));
-      border: 1px solid color-mix(in srgb, CanvasText 18%, transparent);
-      border-radius: 8px;
-      padding: 24px;
+      width: min(880px, calc(100vw - 32px));
+      margin: 28px auto;
+      padding: 0 0 28px;
     }}
     h1 {{
-      font-size: 22px;
-      margin: 0 0 8px;
+      font-size: clamp(24px, 4vw, 34px);
+      margin: 0 0 10px;
+      letter-spacing: 0;
+    }}
+    h2 {{
+      font-size: 17px;
+      margin: 0 0 14px;
     }}
     p {{
       line-height: 1.5;
     }}
+    .panel {{
+      border: 1px solid color-mix(in srgb, CanvasText 18%, transparent);
+      border-radius: 8px;
+      padding: 18px;
+      margin-top: 16px;
+      background: Canvas;
+    }}
+    .grid {{
+      display: grid;
+      grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
+      gap: 14px;
+    }}
+    .provider {{
+      border: 1px solid color-mix(in srgb, CanvasText 14%, transparent);
+      border-radius: 8px;
+      padding: 14px;
+      min-height: 96px;
+    }}
+    .provider strong {{
+      display: block;
+      margin-bottom: 4px;
+    }}
     label {{
       display: block;
       margin-top: 16px;
@@ -61,57 +90,139 @@
       width: 100%;
       box-sizing: border-box;
       margin-top: 6px;
-      padding: 10px;
+      padding: 11px 12px;
       border-radius: 6px;
       border: 1px solid color-mix(in srgb, CanvasText 24%, transparent);
       font: inherit;
     }}
+    input[type="checkbox"] {{
+      width: auto;
+      margin: 0 8px 0 0;
+    }}
+    .check {{
+      display: flex;
+      align-items: center;
+      gap: 8px;
+      margin-top: 10px;
+      font-weight: 600;
+    }}
+    .actions {{
+      display: flex;
+      flex-wrap: wrap;
+      gap: 12px;
+      margin-top: 18px;
+    }}
     button {{
-      margin-top: 20px;
       padding: 10px 14px;
       border: 0;
       border-radius: 6px;
-      background: #1f6feb;
+      background: var(--accent);
       color: white;
       font: inherit;
       font-weight: 600;
       cursor: pointer;
     }}
+    button.secondary {{
+      background: color-mix(in srgb, CanvasText 14%, Canvas);
+      color: CanvasText;
+    }}
     .status {{
       margin-top: 16px;
       padding: 12px;
       border-radius: 6px;
-      background: color-mix(in srgb, #1f6feb 12%, Canvas);
+      background: color-mix(in srgb, var(--accent) 12%, Canvas);
+      overflow-wrap: anywhere;
+    }}
+    .meta {{
+      display: grid;
+      grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
+      gap: 10px;
+      margin-top: 14px;
+    }}
+    .pill {{
+      border: 1px solid color-mix(in srgb, CanvasText 14%, transparent);
+      border-radius: 999px;
+      padding: 8px 12px;
       overflow-wrap: anywhere;
     }}
+    details {{
+      margin-top: 14px;
+    }}
+    summary {{
+      cursor: pointer;
+      font-weight: 600;
+    }}
     .muted {{
       opacity: .75;
       font-size: 14px;
     }}
+    .small {{
+      font-size: 13px;
+    }}
   </style>
 </head>
 <body>
   <main>
-    <h1>Codex LiteLLM session keys</h1>
-    <p class="muted">Keys are kept only in this local process environment and passed to the LiteLLM subprocess. They are not written to disk.</p>
+    <h1>Codex LiteLLM dispatch</h1>
+    <p class="muted">Start a local LiteLLM proxy for Codex with cloud keys kept only in this session. Qwen local runs through Ollama and needs no cloud key.</p>
     {status}
-    <form method="post" action="/start" autocomplete="off">
-      <label for="openai">OPENAI_API_KEY</label>
-      <input id="openai" name="OPENAI_API_KEY" type="password" placeholder="sk-..." autocomplete="off">
-      <label for="gemini">GEMINI_API_KEY</label>
-      <input id="gemini" name="GEMINI_API_KEY" type="password" placeholder="AI..." autocomplete="off">
-      <label for="hf">HF_TOKEN optional</label>
-      <input id="hf" name="HF_TOKEN" type="password" placeholder="hf_..." autocomplete="off">
-      <label for="qwen_base">QWEN_API_BASE optional</label>
-      <input id="qwen_base" name="QWEN_API_BASE" type="url" placeholder="http://127.0.0.1:8000/v1" autocomplete="off">
-      <label for="qwen_key">QWEN_API_KEY optional</label>
-      <input id="qwen_key" name="QWEN_API_KEY" type="password" placeholder="sk-local-qwen" autocomplete="off">
-      <button type="submit">Start session proxy</button>
-    </form>
-    <form method="post" action="/stop">
-      <button type="submit">Stop session proxy</button>
-    </form>
-    <p class="muted">Proxy URL: <code>http://127.0.0.1:{proxy_port}/v1</code></p>
+    <section class="panel">
+      <h2>Providers</h2>
+      <div class="grid">
+        <div class="provider">
+          <strong>OpenAI</strong>
+          <span class="muted small">Used for default and deep coding aliases.</span>
+        </div>
+        <div class="provider">
+          <strong>Gemini</strong>
+          <span class="muted small">Used for low-cost, long-context, and relief routing.</span>
+        </div>
+        <div class="provider">
+          <strong>Hugging Face</strong>
+          <span class="muted small">Optional aliases for HF-hosted open models.</span>
+        </div>
+        <div class="provider">
+          <strong>Qwen local</strong>
+          <span class="muted small">{qwen_status}</span>
+        </div>
+      </div>
+      <div class="meta">
+        <div class="pill">Proxy URL: <code>http://127.0.0.1:{proxy_port}/v1</code></div>
+        <div class="pill">Codex model aliases: <code>codex-light</code>, <code>codex-default</code>, <code>codex-long</code>, <code>codex-deep</code></div>
+        <div class="pill">Local fallback: <code>codex-qwen-local</code></div>
+      </div>
+    </section>
+
+    <section class="panel">
+      <h2>Session keys</h2>
+      <form method="post" action="/start" autocomplete="off">
+        <label for="openai">OpenAI API key</label>
+        <input id="openai" name="OPENAI_API_KEY" type="password" placeholder="sk-..." autocomplete="off">
+        <label for="gemini">Gemini API key</label>
+        <input id="gemini" name="GEMINI_API_KEY" type="password" placeholder="AI..." autocomplete="off">
+        <label for="hf">Hugging Face token optional</label>
+        <input id="hf" name="HF_TOKEN" type="password" placeholder="hf_..." autocomplete="off">
+        <label class="check" for="use_qwen">
+          <input id="use_qwen" name="USE_LOCAL_QWEN" type="checkbox" value="1" checked>
+          Enable local Qwen fallback through Ollama
+        </label>
+        <details>
+          <summary>Advanced custom Qwen endpoint</summary>
+          <label for="qwen_base">Qwen API base optional</label>
+          <input id="qwen_base" name="QWEN_API_BASE" type="url" placeholder="http://127.0.0.1:11434/v1" autocomplete="off">
+          <label for="qwen_key">Qwen API key optional</label>
+          <input id="qwen_key" name="QWEN_API_KEY" type="password" placeholder="sk-ollama-local" autocomplete="off">
+        </details>
+        <div class="actions">
+          <button type="submit">Start proxy</button>
+        </div>
+      </form>
+      <form method="post" action="/stop">
+        <div class="actions">
+          <button class="secondary" type="submit">Stop proxy</button>
+        </div>
+      </form>
+    </section>
   </main>
 </body>
 </html>
@@ -145,6 +256,18 @@ def wait_for_port(host: str, port: int, timeout: float = 20.0) -> bool:
     return False
 
 
+def local_qwen_status() -> str:
+    """Describe whether local Ollama answers and lists the expected Qwen model."""
+    try:
+        with urllib.request.urlopen("http://127.0.0.1:11434/v1/models", timeout=2) as reply:
+            listing = reply.read().decode("utf-8", errors="replace")
+    except OSError:
+        return "Ollama is not reachable on 127.0.0.1:11434."
+    if "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest" not in listing:
+        return "Ollama is running, but the Qwen2.5 Coder model was not found."
+    return "Ready through Ollama on 127.0.0.1:11434."
+
+
 class SessionState:
     """Mutable server state."""
 
@@ -184,7 +307,8 @@ def do_POST(self) -> None:  # noqa: N802
     def _send_page(self) -> None:
         safe_message = html.escape(self.state.message)
         status = f'<div class="status">{safe_message}</div>'
-        body = PAGE.format(status=status, proxy_port=self.proxy_port).encode("utf-8")
+        qwen_status = html.escape(local_qwen_status())
+        body = PAGE.format(status=status, proxy_port=self.proxy_port, qwen_status=qwen_status).encode("utf-8")
         self.send_response(200)
         self.send_header("Content-Type", "text/html; charset=utf-8")
         self.send_header("Cache-Control", "no-store")
@@ -203,10 +327,11 @@ def _start_proxy(self) -> None:
         openai_key = form.get("OPENAI_API_KEY", "")
         gemini_key = form.get("GEMINI_API_KEY", "")
         hf_token = form.get("HF_TOKEN", "")
+        use_local_qwen = form.get("USE_LOCAL_QWEN", "") == "1"
         qwen_base = form.get("QWEN_API_BASE", "")
         qwen_key = form.get("QWEN_API_KEY", "")
-        if not any((openai_key, gemini_key, hf_token, qwen_base)):
-            self.state.message = "Provide at least one provider key."
+        if not any((openai_key, gemini_key, hf_token, use_local_qwen, qwen_base)):
+            self.state.message = "Provide at least one provider key, or keep local Qwen enabled."
             self._send_page()
             return
 
@@ -256,7 +381,7 @@ def _start_proxy(self) -> None:
                 providers.append("Gemini")
             if hf_token:
                 providers.append("Hugging Face")
-            if qwen_base:
+            if use_local_qwen or qwen_base:
                 providers.append("Qwen local")
             self.state.message = "Session proxy started with: " + ", ".join(providers)
         else:
diff --git a/scripts/python/tests/test_codex_key_session_web.py b/scripts/python/tests/test_codex_key_session_web.py
index 8842531..6105c9f 100644
--- a/scripts/python/tests/test_codex_key_session_web.py
+++ b/scripts/python/tests/test_codex_key_session_web.py
@@ -53,6 +53,7 @@ def test_start_proxy_exports_hf_token_under_litellm_mapping_name(self) -> None:
                 "OPENAI_API_KEY": "",
                 "GEMINI_API_KEY": "",
                 "HF_TOKEN": "hf_test",
+                "USE_LOCAL_QWEN": "",
                 "QWEN_API_BASE": "",
                 "QWEN_API_KEY": "",
             }
@@ -79,6 +80,38 @@ def fake_popen(*args: object, **kwargs: object) -> MagicMock:
         self.assertEqual(captured_env["HUGGINGFACE_API_KEY"], "hf_test")
         self.assertIn("Hugging Face", handler.state.message)
 
+    def test_start_proxy_allows_qwen_local_without_cloud_keys(self) -> None:
+        handler = object.__new__(WEB.KeySessionHandler)  # build the handler without running __init__
+        handler.state = WEB.SessionState()
+        handler.config_path = Path("config.yaml")
+        handler.litellm_path = Path("litellm.exe")
+        handler.proxy_host = "127.0.0.1"
+        handler.proxy_port = 4000
+        handler._read_form = MagicMock(  # type: ignore[method-assign]
+            return_value={
+                "OPENAI_API_KEY": "",
+                "GEMINI_API_KEY": "",
+                "HF_TOKEN": "",
+                "USE_LOCAL_QWEN": "1",  # the only provider enabled: checkbox ticked, every key left blank
+                "QWEN_API_BASE": "",
+                "QWEN_API_KEY": "",
+            }
+        )
+        handler._stop_proxy = MagicMock()  # type: ignore[method-assign]
+        handler._send_page = MagicMock()  # type: ignore[method-assign]
+
+        process = MagicMock()
+        process.poll.return_value = None  # Popen.poll() returning None means the process has not exited
+
+        with (
+            patch.object(WEB.subprocess, "Popen", return_value=process),  # never launch a real LiteLLM binary
+            patch.object(WEB, "wait_for_port", return_value=True),  # pretend the proxy port opened
+            patch.dict(os.environ, {}, clear=True),  # start from an empty environment
+        ):
+            handler._start_proxy()
+
+        self.assertIn("Qwen local", handler.state.message)  # the success message lists the enabled providers
+
 
 if __name__ == "__main__":
     unittest.main()

From 7ecc30dc7716d704f37cffbc5d2ab9c439a417f0 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 27 Jun 2026 23:32:57 +0200
Subject: [PATCH 6/6] Add OpenAI quota saver routing

---
 scripts/python/README_Codex_Cost_Routing.md   | 40 +++++++++-
 scripts/python/codex-routing-policy.yaml      |  1 +
 scripts/python/codex_cost_router.py           | 23 +++++-
 scripts/python/litellm-cost-routing.yaml      | 75 +++++++++++++++++--
 .../python/tests/test_codex_cost_router.py    | 16 ++++
 5 files changed, 142 insertions(+), 13 deletions(-)

diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md
index 834444b..a3e3c62 100644
--- a/scripts/python/README_Codex_Cost_Routing.md
+++ b/scripts/python/README_Codex_Cost_Routing.md
@@ -10,21 +10,52 @@ applies budgets, and selects one of these LiteLLM aliases:
 - `codex-default` for normal coding work
 - `codex-long` for long-context reads, log review, and synthesis
 - `codex-deep` for difficult debugging, security, and architecture decisions
+- `codex-no-openai` for Gemini + local Qwen routing when OpenAI quota is low
+  or exhausted
 - `codex-cheap` and `codex-strong` as backward-compatible aliases
 - `codex-hf-cheap` for simple Hugging Face / open-model tasks when `HF_TOKEN`
   is set
 - `codex-hf-fast` for larger Hugging Face / multi-provider tasks when
   `HF_TOKEN` is set
 
-OpenAI and Gemini are both configured through LiteLLM model groups. The normal
-default keeps most code-generation traffic on OpenAI while letting Gemini absorb
-long-context and lower-risk work. This reduces token saturation without sending
-high-stakes changes blindly to the cheapest model.
+OpenAI, Gemini, and local Qwen are configured through LiteLLM model groups. The
+normal default now balances OpenAI with Gemini relief and keeps Qwen as a local
+zero-cost fallback. This reduces token saturation without sending high-stakes
+changes blindly to the cheapest model.
 
 API keys are never committed or written to a configuration file. `OPENAI_API_KEY`
 is required for the default profile; `GEMINI_API_KEY` is optional but recommended
 to activate the OpenAI/Gemini dispatching path.
 
+## OpenAI Quota Saver
+
+When OpenAI quota is low or exhausted, use the `codex-no-openai` alias. Its model
+group contains only Gemini (weighted highest) and local Qwen deployments, with no
+OpenAI entries:
+
+```powershell
+codex --model codex-no-openai
+```
+
+For one-shot wrapper calls, either force the provider:
+
+```powershell
+python .\scripts\python\codex_cost_router.py run --dry-run `
+  --provider no-openai `
+  "Refactor this Python API without using OpenAI quota"
+```
+
+or set a temporary session mode:
+
+```powershell
+$env:CODEX_ROUTER_OPENAI_MODE = 'avoid'
+python .\scripts\python\codex_cost_router.py run --dry-run `
+  "Refactor this Python API without using OpenAI quota"
+```
+
+For a durable default, set `avoid_openai: true` in
+`codex-routing-policy.yaml`.
+
 ## Hugging Face Integration
 
 Hugging Face can be used in two optional places.
@@ -129,6 +160,7 @@ Default policy:
 default_provider: auto
 default_codex_provider: litellm
 open_models_only: false
+avoid_openai: false
 max_cost_usd: 0.0
 
 task_provider_rules:
diff --git a/scripts/python/codex-routing-policy.yaml b/scripts/python/codex-routing-policy.yaml
index 81052fa..0850f90 100644
--- a/scripts/python/codex-routing-policy.yaml
+++ b/scripts/python/codex-routing-policy.yaml
@@ -6,6 +6,7 @@
 default_provider: auto
 default_codex_provider: litellm
 open_models_only: false
+avoid_openai: false
 max_cost_usd: 0.0
 
 task_provider_rules:
diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py
index f4ce62a..916851b 100644
--- a/scripts/python/codex_cost_router.py
+++ b/scripts/python/codex_cost_router.py
@@ -34,9 +34,10 @@
 HF_CHEAP_MODEL = "codex-hf-cheap"
 HF_DIRECT_MODEL = "openai/gpt-oss-120b:fastest"
 QWEN_LOCAL_MODEL = "codex-qwen-local"
+NO_OPENAI_MODEL = "codex-no-openai"
 DEFAULT_MAX_INPUT_TOKENS = 12_000
 DEFAULT_MAX_OUTPUT_TOKENS = 2_000
-PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen")
+PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen", "no-openai")
 CODEX_PROVIDERS = ("litellm", "huggingface")
 MODELS = (
     LIGHT_MODEL,
@@ -48,6 +49,7 @@
     HF_FAST_MODEL,
     HF_CHEAP_MODEL,
     QWEN_LOCAL_MODEL,
+    NO_OPENAI_MODEL,
 )
 LITELLM_HOST = "localhost"
 LITELLM_PORT = 4000
@@ -59,6 +61,7 @@
     "default_provider": "auto",
     "default_codex_provider": "litellm",
     "open_models_only": False,
+    "avoid_openai": False,
     "max_cost_usd": 0.0,
     "task_provider_rules": {
         "simple": "auto",
@@ -80,6 +83,7 @@
     HF_CHEAP_MODEL: {"input": 0.10, "output": 0.30},
     HF_FAST_MODEL: {"input": 0.25, "output": 0.75},
     QWEN_LOCAL_MODEL: {"input": 0.0, "output": 0.0},
+    NO_OPENAI_MODEL: {"input": 0.40, "output": 1.50},
 }
 
 SIMPLE_TERMS = (
@@ -411,6 +415,16 @@ def default_provider() -> str:
     return provider if provider in PROVIDERS else "auto"
 
 
+def openai_avoidance_enabled(policy: dict[str, Any] | None = None) -> bool:
+    """Return True when CODEX_ROUTER_OPENAI_MODE or policy["avoid_openai"] asks to skip OpenAI."""
+    value = os.environ.get("CODEX_ROUTER_OPENAI_MODE", "").strip().casefold()  # tolerate stray whitespace and case
+    if value in {"avoid", "off", "depleted", "quota", "no-openai", "no_openai"}:
+        return True  # an explicit avoid mode wins regardless of the policy file
+    if value in {"", "auto", "normal", "on"}:
+        return bool(policy and policy.get("avoid_openai"))  # neutral mode: defer to the policy flag
+    return False  # any other value disables avoidance, even when the policy enables it
+
+
 def default_codex_provider() -> str:
     """Read the Codex-facing provider preference with a safe fallback."""
     provider = os.environ.get("CODEX_ROUTER_CODEX_PROVIDER", "litellm").casefold()
@@ -441,6 +455,8 @@ def provider_from_policy(
         return default_provider(), "provider forced by CODEX_ROUTER_PROVIDER"
     if bool(policy.get("open_models_only")):
         return "huggingface", "policy open_models_only"
+    if openai_avoidance_enabled(policy):
+        return "no-openai", "OpenAI avoidance enabled"
     complexity, _ = classify_complexity(prompt)
     rules = policy.get("task_provider_rules", {})
     if isinstance(rules, dict) and complexity in rules:
@@ -519,6 +535,11 @@ def route_model(
             return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}"
         return DEFAULT_MODEL, "qwen requested but Ollama is not listening on 127.0.0.1:11434; using default OpenAI/Gemini tier"
 
+    if provider == "no-openai":
+        if qwen_available():
+            return NO_OPENAI_MODEL, f"OpenAI avoided; Gemini/Qwen alias selected; {reason}"
+        return LONG_MODEL, f"OpenAI avoided; Qwen unavailable so Gemini long-context alias selected; {reason}"
+
     if provider == "openai":
         model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL
         return model, f"openai provider requested; {reason}"
diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml
index ccda823..6330036 100644
--- a/scripts/python/litellm-cost-routing.yaml
+++ b/scripts/python/litellm-cost-routing.yaml
@@ -1,10 +1,11 @@
 # LiteLLM OSS self-hosted proxy example for Codex.
 # Task-oriented aliases let Codex route by workload instead of hard-coding a
 # single provider:
-# - codex-light: cheap/frequent work, Gemini Flash biased
-# - codex-default: normal coding work, OpenAI biased with Gemini relief
+# - codex-light: cheap/frequent work, Gemini Flash biased with local Qwen relief
+# - codex-default: normal coding work, balanced OpenAI/Gemini with local Qwen relief
 # - codex-long: long-context reads and synthesis, Gemini Pro biased
-# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased
+# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased with fast fallback
+# - codex-no-openai: Gemini/Qwen routing when OpenAI quota is exhausted
 # - codex-qwen-local: local Ollama Qwen fallback
 # API keys are read from environment variables and must never be committed.
 # Qwen fallback expects Ollama on http://127.0.0.1:11434/v1 with:
@@ -47,19 +48,33 @@ model_list:
     litellm_params:
       model: openai/gpt-5.4-mini
       api_key: os.environ/OPENAI_API_KEY
-      weight: 3
+      weight: 2
+
+  - model_name: codex-light
+    litellm_params:
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
+      weight: 1
 
   - model_name: codex-default
     litellm_params:
       model: openai/gpt-5.5
       api_key: os.environ/OPENAI_API_KEY
-      weight: 8
+      weight: 5
 
   - model_name: codex-default
     litellm_params:
       model: gemini/gemini-3.5-pro
       api_key: os.environ/GEMINI_API_KEY
-      weight: 2
+      weight: 4
+
+  - model_name: codex-default
+    litellm_params:
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
+      weight: 1
 
   - model_name: codex-long
     litellm_params:
@@ -79,18 +94,53 @@ model_list:
       api_key: os.environ/OPENAI_API_KEY
       weight: 1
 
+  - model_name: codex-long
+    litellm_params:
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
+      weight: 1
+
   - model_name: codex-deep
     litellm_params:
       model: openai/gpt-5.5
       api_key: os.environ/OPENAI_API_KEY
-      weight: 10
+      weight: 7
 
   - model_name: codex-deep
     litellm_params:
       model: gemini/gemini-3.5-pro
       api_key: os.environ/GEMINI_API_KEY
+      weight: 2
+
+  - model_name: codex-deep
+    litellm_params:
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
       weight: 1
 
+  # OpenAI quota saver. Use this alias directly, or set
+  # CODEX_ROUTER_OPENAI_MODE=avoid with codex_cost_router.py.
+  - model_name: codex-no-openai
+    litellm_params:
+      model: gemini/gemini-3.5-pro
+      api_key: os.environ/GEMINI_API_KEY
+      weight: 6
+
+  - model_name: codex-no-openai
+    litellm_params:
+      model: gemini/gemini-3.5-flash
+      api_key: os.environ/GEMINI_API_KEY
+      weight: 2
+
+  - model_name: codex-no-openai
+    litellm_params:
+      model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
+      api_base: http://127.0.0.1:11434/v1
+      api_key: sk-ollama-local
+      weight: 2
+
   # Local Ollama fallback for Qwen2.5 Coder 7B GGUF.
   # Prepare with:
   #   ollama pull hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest
@@ -125,24 +175,33 @@ router_settings:
     hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest: codex-qwen-local
   fallbacks:
     - codex-light:
+        - codex-no-openai
         - codex-default
         - codex-qwen-local
     - codex-default:
+        - codex-no-openai
         - codex-long
         - codex-light
         - codex-qwen-local
     - codex-long:
+        - codex-no-openai
         - codex-default
         - codex-qwen-local
     - codex-deep:
+        - codex-no-openai
         - codex-default
         - codex-long
         - codex-qwen-local
+    - codex-no-openai:
+        - codex-long
+        - codex-qwen-local
     - codex-cheap:
+        - codex-no-openai
         - codex-strong
         - codex-default
         - codex-qwen-local
     - codex-strong:
+        - codex-no-openai
         - codex-default
         - codex-long
         - codex-qwen-local
@@ -166,7 +225,7 @@ router_settings:
   allowed_fails_policy:
     AuthenticationErrorAllowedFails: 0
     TimeoutErrorAllowedFails: 2
-    RateLimitErrorAllowedFails: 4
+    RateLimitErrorAllowedFails: 1
 
 litellm_settings:
   drop_params: true
diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py
index fe75d52..9eaf85c 100644
--- a/scripts/python/tests/test_codex_cost_router.py
+++ b/scripts/python/tests/test_codex_cost_router.py
@@ -71,6 +71,12 @@ def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None
                 "codex-qwen-local",
             )
 
+    def test_route_model_can_avoid_openai_with_gemini_qwen_alias(self) -> None:
+        with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}):  # makes qwen_available() True without a socket probe
+            model, reason = ROUTER.route_model("Refactor this Python API", provider="no-openai")
+        self.assertEqual(model, "codex-no-openai")  # the Gemini/Qwen alias is chosen when Qwen counts as available
+        self.assertIn("OpenAI avoided", reason)
+
     def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None:
         with patch.dict(ROUTER.os.environ, {}, clear=True), patch.object(
             ROUTER.socket, "create_connection", side_effect=OSError
@@ -143,6 +149,16 @@ def test_policy_open_models_only_prefers_hugging_face(self) -> None:
         self.assertEqual(ROUTER.provider_from_policy("Security review", None, policy)[0], "huggingface")
         self.assertEqual(ROUTER.codex_provider_from_policy(None, policy)[0], "huggingface")
 
+    def test_policy_or_environment_can_avoid_openai(self) -> None:
+        policy = {**ROUTER.DEFAULT_POLICY, "avoid_openai": True}  # policy-file route to avoidance
+        provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, policy)  # NOTE(review): runs against the ambient environment; confirm no CODEX_ROUTER_* override is set where tests run
+        self.assertEqual(provider, "no-openai")
+        self.assertIn("OpenAI avoidance", reason)
+        with patch.dict(ROUTER.os.environ, {"CODEX_ROUTER_OPENAI_MODE": "avoid"}):  # environment route; the default policy flag stays False
+            provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, ROUTER.DEFAULT_POLICY)
+        self.assertEqual(provider, "no-openai")
+        self.assertIn("OpenAI avoidance", reason)
+
     def test_build_optimized_prompt_respects_budget(self) -> None:
         context = "<div>" + ("Architecture production Odoo migration security. " * 1000) + "</div>"
         optimized = ROUTER.build_optimized_prompt(context, 120)