diff --git a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 index bc87468..42f54f8 100644 --- a/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 +++ b/scripts/python/Install-CodexLocalLiteLLMAssets.ps1 @@ -8,7 +8,9 @@ New-Item -ItemType Directory -Force -Path $target | Out-Null $files = @( 'litellm-cost-routing.yaml', 'codex_key_session_web.py', - 'Start-CodexKeySessionWeb.ps1' + 'Start-CodexKeySessionWeb.ps1', + 'Start-CodexQwenOllama.ps1', + 'Test-CodexLiteLLMDispatch.ps1' ) foreach ($file in $files) { diff --git a/scripts/python/README.md b/scripts/python/README.md index 02ebe3e..a0ed839 100644 --- a/scripts/python/README.md +++ b/scripts/python/README.md @@ -58,9 +58,10 @@ dispatches those aliases across OpenAI and Gemini while keeping API keys in environment variables. When `HF_TOKEN` is available, it can also route Hugging Face and multi-provider tasks through the `codex-hf-cheap` and `codex-hf-fast` LiteLLM aliases, or launch an optional `cost-routing-hf` Codex profile that -points directly at the Hugging Face router. `codex-routing-policy.yaml` keeps -the default provider rules and fallback order editable without changing Python -code. +points directly at the Hugging Face router. `codex-qwen-local` is available as +a local Ollama fallback through `Qwen/Qwen2.5-Coder-7B-Instruct-GGUF`. +`codex-routing-policy.yaml` keeps the default provider rules and fallback order +editable without changing Python code. See [`README_Codex_Cost_Routing.md`](README_Codex_Cost_Routing.md) for setup, activation, LiteLLM configuration, and usage instructions. @@ -68,7 +69,9 @@ activation, LiteLLM configuration, and usage instructions. To enter OpenAI, Gemini, or Hugging Face keys through a local page for one session, run `Start-CodexKeySessionWeb.ps1` and open `http://127.0.0.1:8787/`. Keys are kept in memory for the LiteLLM subprocess -and are not written to disk. +and are not written to disk. Use `Test-CodexLiteLLMDispatch.ps1` to verify the +local proxy aliases, or add `-Call -Model codex-hf-cheap` after entering a +provider key to make one minimal dispatch request. ## LLM Review Tools diff --git a/scripts/python/README_Codex_Cost_Routing.md b/scripts/python/README_Codex_Cost_Routing.md index 2eb5ed9..a3e3c62 100644 --- a/scripts/python/README_Codex_Cost_Routing.md +++ b/scripts/python/README_Codex_Cost_Routing.md @@ -10,21 +10,52 @@ applies budgets, and selects one of these LiteLLM aliases: - `codex-default` for normal coding work - `codex-long` for long-context reads, log review, and synthesis - `codex-deep` for difficult debugging, security, and architecture decisions +- `codex-no-openai` for Gemini + local Qwen routing when OpenAI quota is low + or exhausted - `codex-cheap` and `codex-strong` as backward-compatible aliases - `codex-hf-cheap` for simple Hugging Face / open-model tasks when `HF_TOKEN` is set - `codex-hf-fast` for larger Hugging Face / multi-provider tasks when `HF_TOKEN` is set -OpenAI and Gemini are both configured through LiteLLM model groups. The normal -default keeps most code-generation traffic on OpenAI while letting Gemini absorb -long-context and lower-risk work. This reduces token saturation without sending -high-stakes changes blindly to the cheapest model. +OpenAI, Gemini, and local Qwen are configured through LiteLLM model groups. The +normal default now balances OpenAI with Gemini relief and keeps Qwen as a local +zero-cost fallback. This reduces token saturation without sending high-stakes +changes blindly to the cheapest model. API keys are never committed or written to a configuration file. `OPENAI_API_KEY` is required for the default profile; `GEMINI_API_KEY` is optional but recommended to activate the OpenAI/Gemini dispatching path. +## OpenAI Quota Saver + +When OpenAI quota is low or exhausted, use the `codex-no-openai` alias. It routes +through Gemini first and local Qwen second, without OpenAI entries in the model +group: + +```powershell +codex --model codex-no-openai +``` + +For one-shot wrapper calls, either force the provider: + +```powershell +python .\scripts\python\codex_cost_router.py run --dry-run ` + --provider no-openai ` + "Refactor this Python API without using OpenAI quota" +``` + +or set a temporary session mode: + +```powershell +$env:CODEX_ROUTER_OPENAI_MODE = 'avoid' +python .\scripts\python\codex_cost_router.py run --dry-run ` + "Refactor this Python API without using OpenAI quota" +``` + +For a durable default, set `avoid_openai: true` in +`codex-routing-policy.yaml`. + ## Hugging Face Integration Hugging Face can be used in two optional places. @@ -34,7 +65,7 @@ provider pool. The local config still includes two optional aliases: ```yaml codex-hf-cheap -> huggingface/groq/openai/gpt-oss-120b -codex-hf-fast -> huggingface/together/deepseek-ai/DeepSeek-R1 +codex-hf-fast -> huggingface/together/openai/gpt-oss-120b ``` Set `HF_TOKEN` in the shell before starting the router. A fine-grained token @@ -56,6 +87,35 @@ python .\scripts\python\codex_cost_router.py run --dry-run ` `--provider auto` routes Hugging Face or multi-provider prompts to the HF aliases only when `HF_TOKEN` is present. Otherwise it keeps the OpenAI-backed aliases. +LiteLLM also uses `HUGGINGFACE_API_KEY` while resolving some Inference Provider +mappings. The local web session exports the submitted `HF_TOKEN` under both +names for the LiteLLM subprocess. If you start LiteLLM manually, set both names +to the same token: + +```powershell +$env:HF_TOKEN = 'hf_...' +$env:HUGGINGFACE_API_KEY = $env:HF_TOKEN +``` + +## Local Ollama Qwen Fallback + +The local LiteLLM config includes `codex-qwen-local` as a final fallback for +the main Codex aliases. It uses Ollama's OpenAI-compatible endpoint with the +lighter Qwen2.5 Coder 7B GGUF model: + +```powershell +.\scripts\python\Start-CodexQwenOllama.ps1 +``` + +The script starts Ollama if needed and pulls: + +```text +hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest +``` + +LiteLLM then reaches it through `http://127.0.0.1:11434/v1`. No provider API key +is required for this local fallback. + Second, Hugging Face can be added as an optional Codex-facing layer. Running `enable` now installs two managed profiles: @@ -100,6 +160,7 @@ Default policy: default_provider: auto default_codex_provider: litellm open_models_only: false +avoid_openai: false max_cost_usd: 0.0 task_provider_rules: @@ -155,9 +216,11 @@ If you prefer entering keys in a local page for one work session, start: ``` Then open `http://127.0.0.1:8787/`, paste `OPENAI_API_KEY`, -`GEMINI_API_KEY`, or `HF_TOKEN`, and submit the form. The page starts the -LiteLLM proxy on `http://127.0.0.1:4000/v1` with those keys only in the proxy -process environment. The keys are not written to disk and the web server +`GEMINI_API_KEY`, `HF_TOKEN`, or optional custom Qwen endpoint fields, and +submit the form. For the default local Qwen/Ollama fallback, run +`Start-CodexQwenOllama.ps1`; no Qwen API key is needed. The page starts the +LiteLLM proxy on `http://127.0.0.1:4000/v1` with submitted values only in the +proxy process environment. The keys are not written to disk and the web server suppresses request logging. To launch the optional Hugging Face-facing profile instead of the local LiteLLM @@ -183,6 +246,23 @@ python .\scripts\python\codex_cost_router.py doctor If a browser opened on `http://localhost:4000/health` shows `Unauthorized`, that is expected: the local proxy is protected by `LITELLM_API_KEY`. +Validate the local proxy aliases without making a paid/model call: + +```powershell +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 +``` + +Run a real minimal provider call after entering the relevant key in the local +web page: + +```powershell +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-hf-cheap -Call +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-qwen-local -Call +.\scripts\python\Test-CodexLiteLLMDispatch.ps1 -Model codex-default -Call +``` + +The test prints a compact JSON result and never prints provider tokens. + ## Optimized One-Shot Requests Use the Python wrapper when prompt cleanup and dynamic model routing are needed: @@ -225,6 +305,7 @@ Prompts and API keys are not logged. - `codex_cost_router.py`: prompt optimization and one-shot routing. - `codex_key_session_web.py`: local-only web form for session keys. - `Start-CodexKeySessionWeb.ps1`: PowerShell launcher for the local key page. +- `Test-CodexLiteLLMDispatch.ps1`: local proxy alias and optional call test. - `codex-routing-policy.yaml`: editable routing policy and fallback order. - `litellm-cost-routing.yaml`: local LiteLLM OSS OpenAI/Gemini model groups, context-window fallbacks, cooldowns, and compatibility aliases. diff --git a/scripts/python/Start-CodexQwenOllama.ps1 b/scripts/python/Start-CodexQwenOllama.ps1 new file mode 100644 index 0000000..2372566 --- /dev/null +++ b/scripts/python/Start-CodexQwenOllama.ps1 @@ -0,0 +1,55 @@ +[CmdletBinding()] +param( + [string]$Model = "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest", + [switch]$SkipPull +) + +$ErrorActionPreference = "Stop" + +$ollama = Get-Command ollama -ErrorAction SilentlyContinue +if (-not $ollama) { + throw "Ollama is not installed or not available in PATH." +} + +try { + Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 3 | Out-Null +} +catch { + Start-Process -WindowStyle Hidden -FilePath $ollama.Source -ArgumentList @("serve") + $ready = $false + for ($i = 0; $i -lt 40; $i++) { + Start-Sleep -Milliseconds 500 + try { + Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 2 | Out-Null + $ready = $true + break + } + catch { + $ready = $false + } + } + if (-not $ready) { + throw "Ollama did not become ready on http://127.0.0.1:11434." + } +} + +$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10 +$modelNames = @($tags.models | ForEach-Object { $_.name }) +if (($modelNames -notcontains $Model) -and (-not $SkipPull)) { + & $ollama.Source pull $Model + if ($LASTEXITCODE -ne 0) { + throw "ollama pull failed for $Model" + } +} + +$tags = Invoke-RestMethod -Uri "http://127.0.0.1:11434/api/tags" -TimeoutSec 10 +$modelNames = @($tags.models | ForEach-Object { $_.name }) +if ($modelNames -notcontains $Model) { + throw "$Model is not installed. Run without -SkipPull to download it." +} + +[pscustomobject]@{ + ok = $true + model = $Model + api_base = "http://127.0.0.1:11434/v1" +} | ConvertTo-Json diff --git a/scripts/python/Test-CodexLiteLLMDispatch.ps1 b/scripts/python/Test-CodexLiteLLMDispatch.ps1 new file mode 100644 index 0000000..0c99189 --- /dev/null +++ b/scripts/python/Test-CodexLiteLLMDispatch.ps1 @@ -0,0 +1,95 @@ +[CmdletBinding()] +param( + [string]$BaseUrl = "http://127.0.0.1:4000/v1", + [string]$ApiKey = "sk-local-codex", + [string]$Model = "codex-default", + [switch]$Call, + [int]$TimeoutSec = 90 +) + +$ErrorActionPreference = "Stop" + +$headers = @{ + "Authorization" = "Bearer $ApiKey" + "Content-Type" = "application/json" +} + +function ConvertTo-ShortError { + param([object]$ErrorRecord) + + $message = $ErrorRecord.Exception.Message + if ($ErrorRecord.ErrorDetails -and $ErrorRecord.ErrorDetails.Message) { + $message = $ErrorRecord.ErrorDetails.Message + } + if ($message.Length -gt 900) { + return $message.Substring(0, 900) + "..." + } + return $message +} + +$models = Invoke-RestMethod -Uri "$BaseUrl/models" -Headers $headers -Method Get -TimeoutSec 10 +$modelIds = @($models.data | ForEach-Object { $_.id }) +$requiredAliases = @( + "codex-light", + "codex-default", + "codex-long", + "codex-deep", + "codex-qwen-local", + "codex-hf-cheap", + "codex-hf-fast" +) +$missingAliases = @($requiredAliases | Where-Object { $modelIds -notcontains $_ }) + +$health = $null +try { + $healthUrl = $BaseUrl -replace "/v1$", "" + $health = Invoke-RestMethod -Uri "$healthUrl/health" -Headers $headers -Method Get -TimeoutSec $TimeoutSec +} +catch { + $health = [pscustomobject]@{ + healthy_count = $null + unhealthy_count = $null + health_error = ConvertTo-ShortError $_ + } +} + +$callResult = $null +if ($Call) { + $body = @{ + model = $Model + messages = @( + @{ + role = "user" + content = "Reply with exactly: dispatch ok" + } + ) + max_tokens = 16 + temperature = 0 + } | ConvertTo-Json -Depth 6 + + try { + $response = Invoke-RestMethod -Uri "$BaseUrl/chat/completions" -Headers $headers -Method Post -Body $body -TimeoutSec $TimeoutSec + $callResult = [pscustomobject]@{ + ok = $true + model = $response.model + content = $response.choices[0].message.content + } + } + catch { + $callResult = [pscustomobject]@{ + ok = $false + error = ConvertTo-ShortError $_ + } + } +} + +[pscustomobject]@{ + ok = ($missingAliases.Count -eq 0 -and (-not $Call -or ($callResult -and $callResult.ok))) + base_url = $BaseUrl + aliases_present = @($requiredAliases | Where-Object { $modelIds -contains $_ }) + aliases_missing = $missingAliases + healthy_count = $health.healthy_count + unhealthy_count = $health.unhealthy_count + health_error = $health.health_error + call = $callResult +} | ConvertTo-Json -Depth 6 diff --git a/scripts/python/codex-routing-policy.yaml b/scripts/python/codex-routing-policy.yaml index a1200f9..0850f90 100644 --- a/scripts/python/codex-routing-policy.yaml +++ b/scripts/python/codex-routing-policy.yaml @@ -1,9 +1,12 @@ # Codex cost-routing policy. # CLI options still have priority, then environment variables, then this file. +# Provider choices: auto, openai, gemini, huggingface, qwen. +# qwen uses a self-hosted OpenAI-compatible endpoint via QWEN_API_BASE. default_provider: auto default_codex_provider: litellm open_models_only: false +avoid_openai: false max_cost_usd: 0.0 task_provider_rules: diff --git a/scripts/python/codex_cost_router.py b/scripts/python/codex_cost_router.py index a7b6af1..916851b 100644 --- a/scripts/python/codex_cost_router.py +++ b/scripts/python/codex_cost_router.py @@ -33,9 +33,11 @@ HF_FAST_MODEL = "codex-hf-fast" HF_CHEAP_MODEL = "codex-hf-cheap" HF_DIRECT_MODEL = "openai/gpt-oss-120b:fastest" +QWEN_LOCAL_MODEL = "codex-qwen-local" +NO_OPENAI_MODEL = "codex-no-openai" DEFAULT_MAX_INPUT_TOKENS = 12_000 DEFAULT_MAX_OUTPUT_TOKENS = 2_000 -PROVIDERS = ("auto", "openai", "gemini", "huggingface") +PROVIDERS = ("auto", "openai", "gemini", "huggingface", "qwen", "no-openai") CODEX_PROVIDERS = ("litellm", "huggingface") MODELS = ( LIGHT_MODEL, @@ -46,15 +48,20 @@ LEGACY_STRONG_MODEL, HF_FAST_MODEL, HF_CHEAP_MODEL, + QWEN_LOCAL_MODEL, + NO_OPENAI_MODEL, ) LITELLM_HOST = "localhost" LITELLM_PORT = 4000 +OLLAMA_HOST = "127.0.0.1" +OLLAMA_PORT = 11434 WINDOWS_LITELLM_FALLBACK = Path(r"C:\tmp\litellm-oss\Scripts\litellm.exe") POLICY_FILE = Path(__file__).with_name("codex-routing-policy.yaml") DEFAULT_POLICY = { "default_provider": "auto", "default_codex_provider": "litellm", "open_models_only": False, + "avoid_openai": False, "max_cost_usd": 0.0, "task_provider_rules": { "simple": "auto", @@ -75,6 +82,8 @@ LEGACY_STRONG_MODEL: {"input": 2.00, "output": 8.00}, HF_CHEAP_MODEL: {"input": 0.10, "output": 0.30}, HF_FAST_MODEL: {"input": 0.25, "output": 0.75}, + QWEN_LOCAL_MODEL: {"input": 0.0, "output": 0.0}, + NO_OPENAI_MODEL: {"input": 0.40, "output": 1.50}, } SIMPLE_TERMS = ( @@ -122,6 +131,16 @@ "provider benchmark", "benchmark providers", ) +QWEN_TERMS = ( + "qwen", + "auto-heberge", + "auto heberge", + "auto-hebergee", + "self-hosted", + "self hosted", + "local llm", + "openai-compatible local", +) LONG_CONTEXT_TERMS = ( "gros contexte", "long contexte", @@ -378,12 +397,34 @@ def hf_available() -> bool: return bool(os.environ.get("HF_TOKEN")) +def qwen_available() -> bool: + """Return whether the local Ollama Qwen endpoint is reachable.""" + configured_base = os.environ.get("QWEN_API_BASE") + if configured_base: + return True + try: + with socket.create_connection((OLLAMA_HOST, OLLAMA_PORT), timeout=1): + return True + except OSError: + return False + + def default_provider() -> str: """Read the provider preference from the environment with a safe fallback.""" provider = os.environ.get("CODEX_ROUTER_PROVIDER", "auto").casefold() return provider if provider in PROVIDERS else "auto" +def openai_avoidance_enabled(policy: dict[str, Any] | None = None) -> bool: + """Return whether OpenAI should be avoided to preserve or bypass quota.""" + value = os.environ.get("CODEX_ROUTER_OPENAI_MODE", "").casefold() + if value in {"avoid", "off", "depleted", "quota", "no-openai", "no_openai"}: + return True + if value in {"", "auto", "normal", "on"}: + return bool(policy and policy.get("avoid_openai")) + return False + + def default_codex_provider() -> str: """Read the Codex-facing provider preference with a safe fallback.""" provider = os.environ.get("CODEX_ROUTER_CODEX_PROVIDER", "litellm").casefold() @@ -414,6 +455,8 @@ def provider_from_policy( return default_provider(), "provider forced by CODEX_ROUTER_PROVIDER" if bool(policy.get("open_models_only")): return "huggingface", "policy open_models_only" + if openai_avoidance_enabled(policy): + return "no-openai", "OpenAI avoidance enabled" complexity, _ = classify_complexity(prompt) rules = policy.get("task_provider_rules", {}) if isinstance(rules, dict) and complexity in rules: @@ -487,6 +530,16 @@ def route_model( return model, f"huggingface provider requested; {reason}" return DEFAULT_MODEL, "huggingface requested but HF_TOKEN is missing; using default OpenAI/Gemini tier" + if provider == "qwen": + if qwen_available(): + return QWEN_LOCAL_MODEL, f"qwen provider requested; {reason}" + return DEFAULT_MODEL, "qwen requested but Ollama is not listening on 127.0.0.1:11434; using default OpenAI/Gemini tier" + + if provider == "no-openai": + if qwen_available(): + return NO_OPENAI_MODEL, f"OpenAI avoided; Gemini/Qwen alias selected; {reason}" + return LONG_MODEL, f"OpenAI avoided; Qwen unavailable so Gemini long-context alias selected; {reason}" + if provider == "openai": model = LIGHT_MODEL if complexity == "simple" else DEEP_MODEL return model, f"openai provider requested; {reason}" @@ -499,6 +552,9 @@ def route_model( model = HF_CHEAP_MODEL if complexity == "simple" else HF_FAST_MODEL return model, f"huggingface-related task; {reason}" + if any(term in normalized for term in QWEN_TERMS) and qwen_available(): + return QWEN_LOCAL_MODEL, f"qwen local task; {reason}" + if wants_long_context: return LONG_MODEL, f"long-context task; {reason}" @@ -656,6 +712,7 @@ def print_doctor() -> int: ("LiteLLM proxy localhost:4000", proxy_available(), "listening" if proxy_available() else "not listening"), ("LITELLM_API_KEY", bool(os.environ.get("LITELLM_API_KEY")), "set" if os.environ.get("LITELLM_API_KEY") else "missing"), ("OPENAI_API_KEY", bool(os.environ.get("OPENAI_API_KEY")), "set" if os.environ.get("OPENAI_API_KEY") else "missing"), + ("Ollama Qwen optional", True, "listening on 127.0.0.1:11434" if qwen_available() else "missing; run Start-CodexQwenOllama.ps1"), ("HF_TOKEN optional", True, "set" if hf_available() else "missing; Hugging Face aliases disabled"), ("PYTHONUTF8", os.environ.get("PYTHONUTF8") == "1", "1" if os.environ.get("PYTHONUTF8") == "1" else "missing or not 1"), ("Cost-routing profile", router_enabled(), "enabled" if router_enabled() else "disabled"), diff --git a/scripts/python/codex_key_session_web.py b/scripts/python/codex_key_session_web.py index f957582..35b89b1 100644 --- a/scripts/python/codex_key_session_web.py +++ b/scripts/python/codex_key_session_web.py @@ -10,6 +10,7 @@ import sys import time import urllib.parse +import urllib.request from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path from typing import ClassVar @@ -30,28 +31,56 @@ :root {{ color-scheme: light dark; font-family: Segoe UI, system-ui, sans-serif; + --accent: #1f6feb; + --ok: #238636; + --warn: #9a6700; }} body {{ margin: 0; min-height: 100vh; - display: grid; - place-items: center; + background: color-mix(in srgb, Canvas 94%, CanvasText); background: Canvas; color: CanvasText; }} main {{ - width: min(680px, calc(100vw - 32px)); - border: 1px solid color-mix(in srgb, CanvasText 18%, transparent); - border-radius: 8px; - padding: 24px; + width: min(880px, calc(100vw - 32px)); + margin: 28px auto; + padding: 0 0 28px; }} h1 {{ - font-size: 22px; - margin: 0 0 8px; + font-size: clamp(24px, 4vw, 34px); + margin: 0 0 10px; + letter-spacing: 0; + }} + h2 {{ + font-size: 17px; + margin: 0 0 14px; }} p {{ line-height: 1.5; }} + .panel {{ + border: 1px solid color-mix(in srgb, CanvasText 18%, transparent); + border-radius: 8px; + padding: 18px; + margin-top: 16px; + background: Canvas; + }} + .grid {{ + display: grid; + grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); + gap: 14px; + }} + .provider {{ + border: 1px solid color-mix(in srgb, CanvasText 14%, transparent); + border-radius: 8px; + padding: 14px; + min-height: 96px; + }} + .provider strong {{ + display: block; + margin-bottom: 4px; + }} label {{ display: block; margin-top: 16px; @@ -61,53 +90,139 @@ width: 100%; box-sizing: border-box; margin-top: 6px; - padding: 10px; + padding: 11px 12px; border-radius: 6px; border: 1px solid color-mix(in srgb, CanvasText 24%, transparent); font: inherit; }} + input[type="checkbox"] {{ + width: auto; + margin: 0 8px 0 0; + }} + .check {{ + display: flex; + align-items: center; + gap: 8px; + margin-top: 10px; + font-weight: 600; + }} + .actions {{ + display: flex; + flex-wrap: wrap; + gap: 12px; + margin-top: 18px; + }} button {{ - margin-top: 20px; padding: 10px 14px; border: 0; border-radius: 6px; - background: #1f6feb; + background: var(--accent); color: white; font: inherit; font-weight: 600; cursor: pointer; }} + button.secondary {{ + background: color-mix(in srgb, CanvasText 14%, Canvas); + color: CanvasText; + }} .status {{ margin-top: 16px; padding: 12px; border-radius: 6px; - background: color-mix(in srgb, #1f6feb 12%, Canvas); + background: color-mix(in srgb, var(--accent) 12%, Canvas); + overflow-wrap: anywhere; + }} + .meta {{ + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 10px; + margin-top: 14px; + }} + .pill {{ + border: 1px solid color-mix(in srgb, CanvasText 14%, transparent); + border-radius: 999px; + padding: 8px 12px; overflow-wrap: anywhere; }} + details {{ + margin-top: 14px; + }} + summary {{ + cursor: pointer; + font-weight: 600; + }} .muted {{ opacity: .75; font-size: 14px; }} + .small {{ + font-size: 13px; + }}
-

Codex LiteLLM session keys

-

Keys are kept only in this local process environment and passed to the LiteLLM subprocess. They are not written to disk.

+

Codex LiteLLM dispatch

+

Start a local LiteLLM proxy for Codex with cloud keys kept only in this session. Qwen local runs through Ollama and needs no cloud key.

{status} -
- - - - - - - -
-
- -
-

Proxy URL: http://127.0.0.1:{proxy_port}/v1

+
+

Providers

+
+
+ OpenAI + Used for default and deep coding aliases. +
+
+ Gemini + Used for low-cost, long-context, and relief routing. +
+
+ Hugging Face + Optional aliases for HF-hosted open models. +
+
+ Qwen local + {qwen_status} +
+
+
+
Proxy URL: http://127.0.0.1:{proxy_port}/v1
+
Codex model aliases: codex-light, codex-default, codex-long, codex-deep
+
Local fallback: codex-qwen-local
+
+
+ +
+

Session keys

+
+ + + + + + + +
+ Advanced custom Qwen endpoint + + + + +
+
+ +
+
+
+
+ +
+
+
@@ -141,6 +256,18 @@ def wait_for_port(host: str, port: int, timeout: float = 20.0) -> bool: return False +def local_qwen_status() -> str: + """Return a short status for the local Ollama Qwen fallback.""" + try: + with urllib.request.urlopen("http://127.0.0.1:11434/v1/models", timeout=2) as response: + body = response.read().decode("utf-8", errors="replace") + except OSError: + return "Ollama is not reachable on 127.0.0.1:11434." + if "hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest" in body: + return "Ready through Ollama on 127.0.0.1:11434." + return "Ollama is running, but the Qwen2.5 Coder model was not found." + + class SessionState: """Mutable server state.""" @@ -180,7 +307,8 @@ def do_POST(self) -> None: # noqa: N802 def _send_page(self) -> None: safe_message = html.escape(self.state.message) status = f'
{safe_message}
' - body = PAGE.format(status=status, proxy_port=self.proxy_port).encode("utf-8") + qwen_status = html.escape(local_qwen_status()) + body = PAGE.format(status=status, proxy_port=self.proxy_port, qwen_status=qwen_status).encode("utf-8") self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.send_header("Cache-Control", "no-store") @@ -199,8 +327,11 @@ def _start_proxy(self) -> None: openai_key = form.get("OPENAI_API_KEY", "") gemini_key = form.get("GEMINI_API_KEY", "") hf_token = form.get("HF_TOKEN", "") - if not any((openai_key, gemini_key, hf_token)): - self.state.message = "Provide at least one provider key." + use_local_qwen = form.get("USE_LOCAL_QWEN", "") == "1" + qwen_base = form.get("QWEN_API_BASE", "") + qwen_key = form.get("QWEN_API_KEY", "") + if not any((openai_key, gemini_key, hf_token, use_local_qwen, qwen_base)): + self.state.message = "Provide at least one provider key, or keep local Qwen enabled." self._send_page() return @@ -215,6 +346,10 @@ def _start_proxy(self) -> None: env["GEMINI_API_KEY"] = gemini_key if hf_token: env["HF_TOKEN"] = hf_token + env["HUGGINGFACE_API_KEY"] = hf_token + if qwen_base: + env["QWEN_API_BASE"] = qwen_base.rstrip("/") + env["QWEN_API_KEY"] = qwen_key or "sk-local-qwen" try: self.state.process = subprocess.Popen( @@ -246,6 +381,8 @@ def _start_proxy(self) -> None: providers.append("Gemini") if hf_token: providers.append("Hugging Face") + if use_local_qwen or qwen_base: + providers.append("Qwen local") self.state.message = "Session proxy started with: " + ", ".join(providers) else: self.state.message = "LiteLLM process started, but the proxy port did not become ready yet." diff --git a/scripts/python/litellm-cost-routing.yaml b/scripts/python/litellm-cost-routing.yaml index d7027c4..6330036 100644 --- a/scripts/python/litellm-cost-routing.yaml +++ b/scripts/python/litellm-cost-routing.yaml @@ -1,11 +1,15 @@ # LiteLLM OSS self-hosted proxy example for Codex. # Task-oriented aliases let Codex route by workload instead of hard-coding a # single provider: -# - codex-light: cheap/frequent work, Gemini Flash biased -# - codex-default: normal coding work, OpenAI biased with Gemini relief +# - codex-light: cheap/frequent work, Gemini Flash biased with local Qwen relief +# - codex-default: normal coding work, balanced OpenAI/Gemini with local Qwen relief # - codex-long: long-context reads and synthesis, Gemini Pro biased -# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased +# - codex-deep: high-stakes debugging/security/architecture, OpenAI biased with fast fallback +# - codex-no-openai: Gemini/Qwen routing when OpenAI quota is exhausted +# - codex-qwen-local: local Ollama Qwen fallback # API keys are read from environment variables and must never be committed. +# Qwen fallback expects Ollama on http://127.0.0.1:11434/v1 with: +# hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest model_list: # Backward-compatible aliases used by older wrapper calls. @@ -44,19 +48,33 @@ model_list: litellm_params: model: openai/gpt-5.4-mini api_key: os.environ/OPENAI_API_KEY - weight: 3 + weight: 2 + + - model_name: codex-light + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 - model_name: codex-default litellm_params: model: openai/gpt-5.5 api_key: os.environ/OPENAI_API_KEY - weight: 8 + weight: 5 - model_name: codex-default litellm_params: model: gemini/gemini-3.5-pro api_key: os.environ/GEMINI_API_KEY - weight: 2 + weight: 4 + + - model_name: codex-default + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 - model_name: codex-long litellm_params: @@ -76,16 +94,61 @@ model_list: api_key: os.environ/OPENAI_API_KEY weight: 1 + - model_name: codex-long + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 + - model_name: codex-deep litellm_params: model: openai/gpt-5.5 api_key: os.environ/OPENAI_API_KEY - weight: 10 + weight: 7 - model_name: codex-deep litellm_params: model: gemini/gemini-3.5-pro api_key: os.environ/GEMINI_API_KEY + weight: 2 + + - model_name: codex-deep + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 1 + + # OpenAI quota saver. Use this alias directly, or set + # CODEX_ROUTER_OPENAI_MODE=avoid with codex_cost_router.py. + - model_name: codex-no-openai + litellm_params: + model: gemini/gemini-3.5-pro + api_key: os.environ/GEMINI_API_KEY + weight: 6 + + - model_name: codex-no-openai + litellm_params: + model: gemini/gemini-3.5-flash + api_key: os.environ/GEMINI_API_KEY + weight: 2 + + - model_name: codex-no-openai + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local + weight: 2 + + # Local Ollama fallback for Qwen2.5 Coder 7B GGUF. + # Prepare with: + # ollama pull hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + - model_name: codex-qwen-local + litellm_params: + model: openai/hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest + api_base: http://127.0.0.1:11434/v1 + api_key: sk-ollama-local weight: 1 - model_name: codex-hf-cheap @@ -95,7 +158,7 @@ model_list: - model_name: codex-hf-fast litellm_params: - model: huggingface/together/deepseek-ai/DeepSeek-R1 + model: huggingface/together/openai/gpt-oss-120b api_key: os.environ/HF_TOKEN router_settings: @@ -109,29 +172,49 @@ router_settings: gpt-5.4-mini: codex-light gemini-3.5-pro: codex-long gemini-3.5-flash: codex-light + hf.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:latest: codex-qwen-local fallbacks: - codex-light: + - codex-no-openai - codex-default + - codex-qwen-local - codex-default: + - codex-no-openai - codex-long - codex-light + - codex-qwen-local - codex-long: + - codex-no-openai - codex-default + - codex-qwen-local - codex-deep: + - codex-no-openai - codex-default - codex-long + - codex-qwen-local + - codex-no-openai: + - codex-long + - codex-qwen-local - codex-cheap: + - codex-no-openai - codex-strong - codex-default + - codex-qwen-local - codex-strong: + - codex-no-openai - codex-default - codex-long + - codex-qwen-local - codex-hf-cheap: - codex-light - codex-cheap + - codex-qwen-local - codex-hf-fast: - codex-default - codex-deep + - codex-qwen-local + - codex-qwen-local: + - codex-light context_window_fallbacks: - codex-light: - codex-long @@ -142,7 +225,7 @@ router_settings: allowed_fails_policy: AuthenticationErrorAllowedFails: 0 TimeoutErrorAllowedFails: 2 - RateLimitErrorAllowedFails: 4 + RateLimitErrorAllowedFails: 1 litellm_settings: drop_params: true diff --git a/scripts/python/tests/test_codex_cost_router.py b/scripts/python/tests/test_codex_cost_router.py index cf270bb..9eaf85c 100644 --- a/scripts/python/tests/test_codex_cost_router.py +++ b/scripts/python/tests/test_codex_cost_router.py @@ -60,6 +60,31 @@ def test_route_model_falls_back_when_hugging_face_token_is_missing(self) -> None self.assertEqual(model, "codex-default") self.assertIn("HF_TOKEN is missing", reason) + def test_route_model_can_use_self_hosted_qwen_when_endpoint_exists(self) -> None: + with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}): + self.assertEqual( + ROUTER.route_model("Use qwen auto heberge as backup", provider="qwen")[0], + "codex-qwen-local", + ) + self.assertEqual( + ROUTER.route_model("Prefer self-hosted local llm fallback")[0], + "codex-qwen-local", + ) + + def test_route_model_can_avoid_openai_with_gemini_qwen_alias(self) -> None: + with patch.dict(ROUTER.os.environ, {"QWEN_API_BASE": "http://127.0.0.1:11434/v1"}): + model, reason = ROUTER.route_model("Refactor this Python API", provider="no-openai") + self.assertEqual(model, "codex-no-openai") + self.assertIn("OpenAI avoided", reason) + + def test_route_model_falls_back_when_qwen_endpoint_is_missing(self) -> None: + with patch.dict(ROUTER.os.environ, {}, clear=True), patch.object( + ROUTER.socket, "create_connection", side_effect=OSError + ): + model, reason = ROUTER.route_model("Use Qwen local", provider="qwen") + self.assertEqual(model, "codex-default") + self.assertIn("Ollama is not listening", reason) + def test_codex_provider_helpers_select_expected_profiles(self) -> None: self.assertEqual(ROUTER.codex_profile("litellm"), "cost-routing") self.assertEqual(ROUTER.codex_profile("huggingface"), "cost-routing-hf") @@ -124,6 +149,16 @@ def test_policy_open_models_only_prefers_hugging_face(self) -> None: self.assertEqual(ROUTER.provider_from_policy("Security review", None, policy)[0], "huggingface") self.assertEqual(ROUTER.codex_provider_from_policy(None, policy)[0], "huggingface") + def test_policy_or_environment_can_avoid_openai(self) -> None: + policy = {**ROUTER.DEFAULT_POLICY, "avoid_openai": True} + provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, policy) + self.assertEqual(provider, "no-openai") + self.assertIn("OpenAI avoidance", reason) + with patch.dict(ROUTER.os.environ, {"CODEX_ROUTER_OPENAI_MODE": "avoid"}): + provider, reason = ROUTER.provider_from_policy("Refactor this Python API", None, ROUTER.DEFAULT_POLICY) + self.assertEqual(provider, "no-openai") + self.assertIn("OpenAI avoidance", reason) + def test_build_optimized_prompt_respects_budget(self) -> None: context = "
" + ("Architecture production Odoo migration security. " * 1000) + "
" optimized = ROUTER.build_optimized_prompt(context, 120) diff --git a/scripts/python/tests/test_codex_key_session_web.py b/scripts/python/tests/test_codex_key_session_web.py index afce691..6105c9f 100644 --- a/scripts/python/tests/test_codex_key_session_web.py +++ b/scripts/python/tests/test_codex_key_session_web.py @@ -1,9 +1,10 @@ """Tests for the optional local web key session launcher.""" import importlib.util +import os import unittest from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch MODULE_PATH = Path(__file__).resolve().parents[1] / "codex_key_session_web.py" @@ -40,6 +41,77 @@ def test_stop_proxy_terminates_running_process(self) -> None: self.assertIsNone(state.process) self.assertEqual(state.message, "done") + def test_start_proxy_exports_hf_token_under_litellm_mapping_name(self) -> None: + handler = object.__new__(WEB.KeySessionHandler) + handler.state = WEB.SessionState() + handler.config_path = Path("config.yaml") + handler.litellm_path = Path("litellm.exe") + handler.proxy_host = "127.0.0.1" + handler.proxy_port = 4000 + handler._read_form = MagicMock( # type: ignore[method-assign] + return_value={ + "OPENAI_API_KEY": "", + "GEMINI_API_KEY": "", + "HF_TOKEN": "hf_test", + "USE_LOCAL_QWEN": "", + "QWEN_API_BASE": "", + "QWEN_API_KEY": "", + } + ) + handler._stop_proxy = MagicMock() # type: ignore[method-assign] + handler._send_page = MagicMock() # type: ignore[method-assign] + + process = MagicMock() + process.poll.return_value = None + captured_env: dict[str, str] = {} + + def fake_popen(*args: object, **kwargs: object) -> MagicMock: + captured_env.update(kwargs["env"]) # type: ignore[index] + return process + + with ( + patch.object(WEB.subprocess, "Popen", side_effect=fake_popen), + patch.object(WEB, "wait_for_port", return_value=True), + patch.dict(os.environ, {}, clear=True), + ): + handler._start_proxy() + + self.assertEqual(captured_env["HF_TOKEN"], "hf_test") + self.assertEqual(captured_env["HUGGINGFACE_API_KEY"], "hf_test") + self.assertIn("Hugging Face", handler.state.message) + + def test_start_proxy_allows_qwen_local_without_cloud_keys(self) -> None: + handler = object.__new__(WEB.KeySessionHandler) + handler.state = WEB.SessionState() + handler.config_path = Path("config.yaml") + handler.litellm_path = Path("litellm.exe") + handler.proxy_host = "127.0.0.1" + handler.proxy_port = 4000 + handler._read_form = MagicMock( # type: ignore[method-assign] + return_value={ + "OPENAI_API_KEY": "", + "GEMINI_API_KEY": "", + "HF_TOKEN": "", + "USE_LOCAL_QWEN": "1", + "QWEN_API_BASE": "", + "QWEN_API_KEY": "", + } + ) + handler._stop_proxy = MagicMock() # type: ignore[method-assign] + handler._send_page = MagicMock() # type: ignore[method-assign] + + process = MagicMock() + process.poll.return_value = None + + with ( + patch.object(WEB.subprocess, "Popen", return_value=process), + patch.object(WEB, "wait_for_port", return_value=True), + patch.dict(os.environ, {}, clear=True), + ): + handler._start_proxy() + + self.assertIn("Qwen local", handler.state.message) + if __name__ == "__main__": unittest.main()