
Commit 781d1ed

Project Team committed
Fix streaming timeout: use httpx.Timeout to separate connect from read
llama3.2-vision encodes the image before emitting any tokens, so first-token latency on a T4 can be 30-90s under VRAM pressure. Passing a plain integer to ollama.Client applied that value as the httpx read timeout on every individual chunk, which fired during the image-encoding phase (before the first token) even though Ollama was working correctly. Use httpx.Timeout(timeout=<configured>, connect=10) so the read timeout covers the full inference window, while the connect timeout still fails fast if Ollama is unreachable.
1 parent: d4a71f6 · commit: 781d1ed
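
The difference between the two timeout shapes is easiest to see directly. Below is a minimal sketch of httpx's documented `Timeout` semantics; the 120-second value is an illustrative placeholder, not this project's configured timeout:

```python
import httpx

# A bare number fans out to all four httpx timeout classes, so connect,
# read, write, and pool each get the same value.
before = httpx.Timeout(120.0)
assert before.connect == before.read == 120.0

# Splitting them lets connect fail fast while read still covers the
# slow first token of a streaming response.
after = httpx.Timeout(timeout=120.0, connect=10.0)
assert after.connect == 10.0 and after.read == after.write == 120.0
```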

1 file changed: app/ocr_backends.py (17 additions, 4 deletions)
```diff
--- a/app/ocr_backends.py
+++ b/app/ocr_backends.py
@@ -83,13 +83,26 @@ def __init__(self, model: str = "llama3.2-vision", host: str = "http://localhost
         self._is_available = False
         self._availability_error = None
 
-        # Import ollama library and create a client with the configured timeout.
-        # The module-level ollama.chat() has no timeout parameter; the Client
-        # constructor forwards **kwargs to httpx.Client, which does.
+        # Build an httpx.Timeout that separates concerns:
+        #
+        #   connect=10   — fail fast if Ollama isn't reachable at all
+        #   read=timeout — how long to wait for the *first* streaming token.
+        #
+        # llama3.2-vision does substantial image-encoding work before it emits
+        # any tokens, so first-token latency on a T4 can be 30-90s depending on
+        # VRAM pressure. Using a plain integer timeout applies the same value to
+        # every chunk read, which fires prematurely on that initial encoding
+        # phase even though the model is working fine. By setting read= to the
+        # full configured timeout we preserve the ability to catch a genuinely
+        # hung Ollama while not cutting off a legitimately slow first token.
         try:
+            import httpx
             import ollama
             self.ollama = ollama
-            self._client = ollama.Client(host=host, timeout=timeout)
+            self._client = ollama.Client(
+                host=host,
+                timeout=httpx.Timeout(timeout=float(timeout), connect=10.0),
+            )
         except ImportError:
             self._is_available = False
             self._availability_error = "ollama Python library not installed. Install with: pip install ollama"
```
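
For context, here is a hedged sketch of how the streaming path interacts with the two timeouts. The `chat()` call shape follows the ollama library's streaming API, but the model name, prompt, image path, and the assumption that httpx's timeout exceptions propagate through `ollama.Client` are illustrative, not taken from this repository:

```python
import httpx
import ollama

client = ollama.Client(
    host="http://localhost:11434",  # Ollama's standard default host
    timeout=httpx.Timeout(timeout=120.0, connect=10.0),
)
try:
    stream = client.chat(
        model="llama3.2-vision",
        messages=[{
            "role": "user",
            "content": "Transcribe the text in this image.",
            "images": ["page.png"],  # placeholder image path
        }],
        stream=True,
    )
    # Each chunk is one streamed token batch; the read timeout applies to
    # the wait for each chunk, including the long image-encoding wait
    # before the first one.
    text = "".join(chunk["message"]["content"] for chunk in stream)
except httpx.ConnectTimeout:
    ...  # Ollama unreachable: fails within ~10s instead of the full window
except httpx.ReadTimeout:
    ...  # no chunk arrived within 120s: treat as a genuinely hung server
```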
