From 9073769da8f886ca75ca223b4d48f8de438c76f0 Mon Sep 17 00:00:00 2001
From: Shanu <shanu@tinyhumans.ai>
Date: Mon, 18 May 2026 17:29:45 +0530
Subject: [PATCH 1/4] fix(supervision): enhance error reporting for channel
 disconnections and add test for expected error classification

---
 src/openhuman/channels/runtime/supervision.rs | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/channels/runtime/supervision.rs b/src/openhuman/channels/runtime/supervision.rs
index 0667237c0a..25114d556d 100644
--- a/src/openhuman/channels/runtime/supervision.rs
+++ b/src/openhuman/channels/runtime/supervision.rs
@@ -57,7 +57,13 @@ pub(crate) fn spawn_supervised_listener(
                     backoff = initial_backoff_secs.max(1);
                 }
                 Err(e) => {
-                    tracing::error!("Channel {} error: {e}; restarting", ch.name());
+                    let message = format!("Channel {} error: {e:#}; restarting", ch.name());
+                    crate::core::observability::report_error_or_expected(
+                        message.as_str(),
+                        "channels",
+                        "supervised_listener",
+                        &[("channel", ch.name())],
+                    );
                     publish_global(DomainEvent::ChannelDisconnected {
                         channel: ch.name().to_string(),
                         reason: e.to_string(),
@@ -118,4 +124,18 @@ mod tests {
         let result = compute_max_in_flight_messages(usize::MAX);
         assert!(result <= CHANNEL_MAX_IN_FLIGHT_MESSAGES);
     }
+
+    #[test]
+    fn supervision_discord_gateway_reqwest_failure_classifies_as_expected() {
+        let raw = "error sending request for url (https://discord.com/api/v10/gateway/bot)";
+        let wrapped = format!("Channel discord error: {raw}; restarting");
+        let kind = crate::core::observability::expected_error_kind(&wrapped);
+        assert_eq!(
+            kind,
+            Some(crate::core::observability::ExpectedErrorKind::NetworkUnreachable),
+            "supervision wrapper must keep transient transport phrase visible \
+             to the classifier so Sentry stays quiet for OPENHUMAN-TAURI-VP \
+             (got {kind:?} for message {wrapped:?})"
+        );
+    }
 }

From 7e153a9748026825d1e85c06c87e0955537ae5a4 Mon Sep 17 00:00:00 2001
From: Shanu <shanu@tinyhumans.ai>
Date: Mon, 18 May 2026 17:32:44 +0530
Subject: [PATCH 2/4] feat(observability): add handling and logging for custom
 OpenAI upstream bad request errors with tests

---
 src/core/observability.rs                     | 27 +++++++++
 .../inference/provider/compatible.rs          | 55 +++++++++++++++++++
 src/openhuman/inference/provider/ops.rs       | 42 ++++++++++++++
 3 files changed, 124 insertions(+)

diff --git a/src/core/observability.rs b/src/core/observability.rs
index 8f685f17d8..01f8a9fd4b 100644
--- a/src/core/observability.rs
+++ b/src/core/observability.rs
@@ -326,6 +326,10 @@ fn is_provider_user_state_message(lower: &str) -> bool {
         return true;
     }
 
+    if lower.contains("bad request to upstream provider") && lower.contains("upstream_error") {
+        return true;
+    }
+
     // OPENHUMAN-TAURI-97: composio authorize with a blank required field —
     // SharePoint Subdomain, WhatsApp WABA ID, Tenant Name, etc.
     // Backend returns 500 with `"Missing required fields: …"` body.
@@ -1414,6 +1418,29 @@ mod tests {
         );
     }
 
+    #[test]
+    fn classifies_custom_openai_upstream_bad_request_as_provider_user_state() {
+        assert_eq!(
+            expected_error_kind(
+                "custom_openai API error (400 Bad Request): \
+                 {\"error\":{\"message\":\"Bad request to upstream provider\",\
+                 \"type\":\"upstream_error\",\"status\":400}}"
+            ),
+            Some(ExpectedErrorKind::ProviderUserState)
+        );
+
+        // Wrapped by higher-level callers (`agent.run_single`,
+        // `rpc.invoke_method`) must still classify.
+        assert_eq!(
+            expected_error_kind(
+                "agent.run_single failed: custom_openai API error (400 Bad Request): \
+                 {\"error\":{\"message\":\"Bad request to upstream provider\",\
+                 \"type\":\"upstream_error\",\"status\":400}}"
+            ),
+            Some(ExpectedErrorKind::ProviderUserState)
+        );
+    }
+
     #[test]
     fn classifies_missing_required_fields_as_provider_user_state() {
         // OPENHUMAN-TAURI-97: composio authorize with a blank required
diff --git a/src/openhuman/inference/provider/compatible.rs b/src/openhuman/inference/provider/compatible.rs
index 80d7711023..91cd71352f 100644
--- a/src/openhuman/inference/provider/compatible.rs
+++ b/src/openhuman/inference/provider/compatible.rs
@@ -446,6 +446,17 @@ impl OpenAiCompatibleProvider {
                     Some(model),
                     status,
                 );
+            } else if super::is_custom_openai_upstream_bad_request_http_400(
+                self.name.as_str(),
+                status,
+                &error,
+            ) {
+                super::log_custom_openai_upstream_bad_request_http_400(
+                    "responses_api",
+                    self.name.as_str(),
+                    Some(model),
+                    status,
+                );
             } else if super::should_report_provider_http_failure(status) {
                 crate::core::observability::report_error(
                     message.as_str(),
@@ -790,6 +801,17 @@ impl OpenAiCompatibleProvider {
                     Some(native_request.model.as_str()),
                     status,
                 );
+            } else if super::is_custom_openai_upstream_bad_request_http_400(
+                self.name.as_str(),
+                status,
+                &body,
+            ) {
+                super::log_custom_openai_upstream_bad_request_http_400(
+                    "streaming_chat",
+                    self.name.as_str(),
+                    Some(native_request.model.as_str()),
+                    status,
+                );
             } else if super::should_report_provider_http_failure(status) {
                 crate::core::observability::report_error(
                     message.as_str(),
@@ -1251,6 +1273,17 @@ impl Provider for OpenAiCompatibleProvider {
                     Some(model),
                     status,
                 );
+            } else if super::is_custom_openai_upstream_bad_request_http_400(
+                self.name.as_str(),
+                status,
+                &error,
+            ) {
+                super::log_custom_openai_upstream_bad_request_http_400(
+                    "chat_completions",
+                    self.name.as_str(),
+                    Some(model),
+                    status,
+                );
             } else if super::should_report_provider_http_failure(status) {
                 crate::core::observability::report_error(
                     message.as_str(),
@@ -1642,6 +1675,17 @@ impl Provider for OpenAiCompatibleProvider {
                     Some(model),
                     status,
                 );
+            } else if super::is_custom_openai_upstream_bad_request_http_400(
+                self.name.as_str(),
+                status,
+                &error,
+            ) {
+                super::log_custom_openai_upstream_bad_request_http_400(
+                    "native_chat",
+                    self.name.as_str(),
+                    Some(model),
+                    status,
+                );
             } else if super::should_report_provider_http_failure(status) {
                 crate::core::observability::report_error(
                     message.as_str(),
@@ -1779,6 +1823,17 @@ impl Provider for OpenAiCompatibleProvider {
                         Some(model_owned.as_str()),
                         status,
                     );
+                } else if super::is_custom_openai_upstream_bad_request_http_400(
+                    provider_name.as_str(),
+                    status,
+                    &raw_error,
+                ) {
+                    super::log_custom_openai_upstream_bad_request_http_400(
+                        "stream_chat",
+                        provider_name.as_str(),
+                        Some(model_owned.as_str()),
+                        status,
+                    );
                 } else if super::should_report_provider_http_failure(status) {
                     crate::core::observability::report_error(
                         message.as_str(),
diff --git a/src/openhuman/inference/provider/ops.rs b/src/openhuman/inference/provider/ops.rs
index cb3533de8a..378414af95 100644
--- a/src/openhuman/inference/provider/ops.rs
+++ b/src/openhuman/inference/provider/ops.rs
@@ -272,6 +272,25 @@ pub(super) fn is_budget_exhausted_http_400(status: reqwest::StatusCode, body: &s
     status == reqwest::StatusCode::BAD_REQUEST && super::is_budget_exhausted_message(body)
 }
 
+/// Whether a custom OpenAI-compatible proxy returned the known generic
+/// upstream 400 envelope:
+/// `{"error":{"message":"Bad request to upstream provider","type":"upstream_error","status":400}}`.
+///
+/// This shape is deterministic provider/user-state (endpoint-model mismatch,
+/// unsupported schema, provider-side validation) and does not provide
+/// actionable signal for OpenHuman Sentry triage.
+pub(super) fn is_custom_openai_upstream_bad_request_http_400(
+    provider: &str,
+    status: reqwest::StatusCode,
+    body: &str,
+) -> bool {
+    if provider != "custom_openai" || status != reqwest::StatusCode::BAD_REQUEST {
+        return false;
+    }
+    let lower = body.to_ascii_lowercase();
+    lower.contains("bad request to upstream provider") && lower.contains("upstream_error")
+}
+
 pub(super) fn log_budget_exhausted_http_400(
     operation: &str,
     provider: &str,
@@ -290,6 +309,25 @@ pub(super) fn log_budget_exhausted_http_400(
     );
 }
 
+pub(super) fn log_custom_openai_upstream_bad_request_http_400(
+    operation: &str,
+    provider: &str,
+    model: Option<&str>,
+    status: reqwest::StatusCode,
+) {
+    tracing::info!(
+        domain = "llm_provider",
+        operation = operation,
+        provider = provider,
+        model = model.unwrap_or(""),
+        status = status.as_u16(),
+        failure = "non_2xx",
+        kind = "provider_user_state",
+        reason = "custom_openai_upstream_bad_request",
+        "[llm_provider] {operation} custom_openai upstream 400 — not reporting to Sentry"
+    );
+}
+
 /// Build a sanitized provider error from a failed HTTP response.
 ///
 /// Reports the failure to Sentry with `provider` and `status` tags so
@@ -321,6 +359,8 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E
     let is_auth_failure = matches!(status.as_u16(), 401 | 403);
     let is_backend = provider == openhuman_backend::PROVIDER_LABEL;
     let is_budget_exhausted_user_state = is_budget_exhausted_http_400(status, &body);
+    let is_custom_openai_upstream_bad_request =
+        is_custom_openai_upstream_bad_request_http_400(provider, status, &body);
 
     if is_auth_failure && is_backend {
         tracing::warn!(
@@ -343,6 +383,8 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E
         );
     } else if is_budget_exhausted_user_state {
         log_budget_exhausted_http_400("api_error", provider, None, status);
+    } else if is_custom_openai_upstream_bad_request {
+        log_custom_openai_upstream_bad_request_http_400("api_error", provider, None, status);
     } else if should_report_provider_http_failure(status) {
         crate::core::observability::report_error(
             message.as_str(),

From 980184b604677ef37a38a339c9ddd6488738570f Mon Sep 17 00:00:00 2001
From: Shanu <shanu@tinyhumans.ai>
Date: Mon, 18 May 2026 18:18:59 +0530
Subject: [PATCH 3/4] feat(database): implement busy timeout for SQLite
 connections to prevent locking issues

---
 src/openhuman/whatsapp_data/store.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/openhuman/whatsapp_data/store.rs b/src/openhuman/whatsapp_data/store.rs
index 6d35df5ef0..2f72a6d848 100644
--- a/src/openhuman/whatsapp_data/store.rs
+++ b/src/openhuman/whatsapp_data/store.rs
@@ -7,10 +7,17 @@
 
 use std::collections::HashMap;
 use std::path::Path;
+use std::time::Duration;
 
 use anyhow::{Context, Result};
 use rusqlite::{params, Connection};
 
+/// Matches the value used by `memory/tree/store.rs`. Without this, any
+/// concurrent writer (e.g. an overlapping `whatsapp_data_ingest` tick, or
+/// ingest racing with a UI-triggered list/search read on the same DB) gets
+/// `SQLITE_BUSY` returned immediately as `database is locked`.
+const SQLITE_BUSY_TIMEOUT: Duration = Duration::from_secs(15);
+
 use crate::openhuman::whatsapp_data::types::{
     ChatMeta, IngestMessage, ListChatsRequest, ListMessagesRequest, SearchMessagesRequest,
     WhatsAppChat, WhatsAppMessage,
@@ -77,8 +84,11 @@ impl WhatsAppDataStore {
     }
 
     fn open_conn(&self) -> Result<Connection> {
-        Connection::open(&self.db_path)
-            .with_context(|| format!("open whatsapp_data db: {}", self.db_path.display()))
+        let conn = Connection::open(&self.db_path)
+            .with_context(|| format!("open whatsapp_data db: {}", self.db_path.display()))?;
+        conn.busy_timeout(SQLITE_BUSY_TIMEOUT)
+            .context("configure whatsapp_data busy_timeout")?;
+        Ok(conn)
     }
 
     fn now_secs() -> i64 {

From 5954fbe233bc43536d4b950db6bb7f6beefca1b6 Mon Sep 17 00:00:00 2001
From: Shanu <shanu@tinyhumans.ai>
Date: Mon, 18 May 2026 19:01:21 +0530
Subject: [PATCH 4/4] fix(observability): anchor custom_openai 400 matcher to
 provider prefix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per CodeRabbit feedback on PR #2107: the previous matcher demoted any
error containing both "bad request to upstream provider" and
"upstream_error" — too broad. Anchor it to the canonical envelope
prefix "custom_openai api error (400" so it can't silence unrelated
errors that happen to mention either substring (e.g. a future provider
whose wire shape reuses one of them).

Adds a regression test confirming an error that contains both inner
substrings WITHOUT the custom_openai 400 anchor is no longer silenced.
---
 src/core/observability.rs | 44 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/core/observability.rs b/src/core/observability.rs
index 01f8a9fd4b..e5e051007e 100644
--- a/src/core/observability.rs
+++ b/src/core/observability.rs
@@ -326,7 +326,22 @@ fn is_provider_user_state_message(lower: &str) -> bool {
         return true;
     }
 
-    if lower.contains("bad request to upstream provider") && lower.contains("upstream_error") {
+    // OPENHUMAN-TAURI-XX: custom_openai upstream rejected the request with
+    // its own 400. Wire shape produced by
+    // `inference/provider/compatible.rs::is_custom_openai_upstream_bad_request_http_400`:
+    //
+    //   custom_openai API error (400 Bad Request): {"error":{
+    //     "message":"Bad request to upstream provider",
+    //     "type":"upstream_error","status":400}}
+    //
+    // Anchored to the `custom_openai api error (400` prefix so this can't
+    // silence unrelated errors that happen to mention both
+    // "bad request to upstream provider" and "upstream_error" elsewhere
+    // (e.g. a future provider whose envelope reuses one of those strings).
+    if lower.contains("custom_openai api error (400")
+        && lower.contains("bad request to upstream provider")
+        && lower.contains("upstream_error")
+    {
         return true;
     }
 
@@ -1441,6 +1456,33 @@ mod tests {
         );
     }
 
+    /// Regression for CodeRabbit feedback on PR #2107: the matcher must
+    /// not demote unrelated errors that happen to contain both
+    /// "bad request to upstream provider" and "upstream_error" without
+    /// the `custom_openai API error (400` anchor.
+    #[test]
+    fn does_not_silence_unrelated_error_with_only_inner_substrings() {
+        // No `custom_openai API error (400` prefix → must NOT classify
+        // as ProviderUserState, otherwise we'd silence actionable bugs.
+        assert_eq!(
+            expected_error_kind(
+                "internal panic in router: bad request to upstream provider \
+                 (state=upstream_error)"
+            ),
+            None,
+        );
+
+        // A future hypothetical provider envelope reusing one substring
+        // also must not classify.
+        assert_eq!(
+            expected_error_kind(
+                "anthropic_api error: upstream_error encountered while \
+                 forwarding bad request to upstream provider"
+            ),
+            None,
+        );
+    }
+
     #[test]
     fn classifies_missing_required_fields_as_provider_user_state() {
         // OPENHUMAN-TAURI-97: composio authorize with a blank required