From 9073769da8f886ca75ca223b4d48f8de438c76f0 Mon Sep 17 00:00:00 2001 From: Shanu Date: Mon, 18 May 2026 17:29:45 +0530 Subject: [PATCH 1/4] fix(supervision): enhance error reporting for channel disconnections and add test for expected error classification --- src/openhuman/channels/runtime/supervision.rs | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/openhuman/channels/runtime/supervision.rs b/src/openhuman/channels/runtime/supervision.rs index 0667237c0a..25114d556d 100644 --- a/src/openhuman/channels/runtime/supervision.rs +++ b/src/openhuman/channels/runtime/supervision.rs @@ -57,7 +57,13 @@ pub(crate) fn spawn_supervised_listener( backoff = initial_backoff_secs.max(1); } Err(e) => { - tracing::error!("Channel {} error: {e}; restarting", ch.name()); + let message = format!("Channel {} error: {e:#}; restarting", ch.name()); + crate::core::observability::report_error_or_expected( + message.as_str(), + "channels", + "supervised_listener", + &[("channel", ch.name())], + ); publish_global(DomainEvent::ChannelDisconnected { channel: ch.name().to_string(), reason: e.to_string(), @@ -118,4 +124,18 @@ mod tests { let result = compute_max_in_flight_messages(usize::MAX); assert!(result <= CHANNEL_MAX_IN_FLIGHT_MESSAGES); } + + #[test] + fn supervision_discord_gateway_reqwest_failure_classifies_as_expected() { + let raw = "error sending request for url (https://discord.com/api/v10/gateway/bot)"; + let wrapped = format!("Channel discord error: {raw}; restarting"); + let kind = crate::core::observability::expected_error_kind(&wrapped); + assert_eq!( + kind, + Some(crate::core::observability::ExpectedErrorKind::NetworkUnreachable), + "supervision wrapper must keep transient transport phrase visible \ + to the classifier so Sentry stays quiet for OPENHUMAN-TAURI-VP \ + (got {kind:?} for message {wrapped:?})" + ); + } } From 7e153a9748026825d1e85c06c87e0955537ae5a4 Mon Sep 17 00:00:00 2001 From: Shanu Date: Mon, 18 May 2026 17:32:44 +0530 Subject: [PATCH 2/4] feat(observability): add handling and logging for custom OpenAI upstream bad request errors with tests --- src/core/observability.rs | 27 +++++++++ .../inference/provider/compatible.rs | 55 +++++++++++++++++++ src/openhuman/inference/provider/ops.rs | 42 ++++++++++++++ 3 files changed, 124 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index 8f685f17d8..01f8a9fd4b 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -326,6 +326,10 @@ fn is_provider_user_state_message(lower: &str) -> bool { return true; } + if lower.contains("bad request to upstream provider") && lower.contains("upstream_error") { + return true; + } + // OPENHUMAN-TAURI-97: composio authorize with a blank required field — // SharePoint Subdomain, WhatsApp WABA ID, Tenant Name, etc. // Backend returns 500 with `"Missing required fields: …"` body. @@ -1414,6 +1418,29 @@ mod tests { ); } + #[test] + fn classifies_custom_openai_upstream_bad_request_as_provider_user_state() { + assert_eq!( + expected_error_kind( + "custom_openai API error (400 Bad Request): \ + {\"error\":{\"message\":\"Bad request to upstream provider\",\ + \"type\":\"upstream_error\",\"status\":400}}" + ), + Some(ExpectedErrorKind::ProviderUserState) + ); + + // Wrapped by higher-level callers (`agent.run_single`, + // `rpc.invoke_method`) must still classify. + assert_eq!( + expected_error_kind( + "agent.run_single failed: custom_openai API error (400 Bad Request): \ + {\"error\":{\"message\":\"Bad request to upstream provider\",\ + \"type\":\"upstream_error\",\"status\":400}}" + ), + Some(ExpectedErrorKind::ProviderUserState) + ); + } + #[test] fn classifies_missing_required_fields_as_provider_user_state() { // OPENHUMAN-TAURI-97: composio authorize with a blank required diff --git a/src/openhuman/inference/provider/compatible.rs b/src/openhuman/inference/provider/compatible.rs index 80d7711023..91cd71352f 100644 --- a/src/openhuman/inference/provider/compatible.rs +++ b/src/openhuman/inference/provider/compatible.rs @@ -446,6 +446,17 @@ impl OpenAiCompatibleProvider { Some(model), status, ); + } else if super::is_custom_openai_upstream_bad_request_http_400( + self.name.as_str(), + status, + &error, + ) { + super::log_custom_openai_upstream_bad_request_http_400( + "responses_api", + self.name.as_str(), + Some(model), + status, + ); } else if super::should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), @@ -790,6 +801,17 @@ impl OpenAiCompatibleProvider { Some(native_request.model.as_str()), status, ); + } else if super::is_custom_openai_upstream_bad_request_http_400( + self.name.as_str(), + status, + &body, + ) { + super::log_custom_openai_upstream_bad_request_http_400( + "streaming_chat", + self.name.as_str(), + Some(native_request.model.as_str()), + status, + ); } else if super::should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), @@ -1251,6 +1273,17 @@ impl Provider for OpenAiCompatibleProvider { Some(model), status, ); + } else if super::is_custom_openai_upstream_bad_request_http_400( + self.name.as_str(), + status, + &error, + ) { + super::log_custom_openai_upstream_bad_request_http_400( + "chat_completions", + self.name.as_str(), + Some(model), + status, + ); } else if super::should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), @@ -1642,6 +1675,17 @@ impl Provider for OpenAiCompatibleProvider { Some(model), status, ); + } else if super::is_custom_openai_upstream_bad_request_http_400( + self.name.as_str(), + status, + &error, + ) { + super::log_custom_openai_upstream_bad_request_http_400( + "native_chat", + self.name.as_str(), + Some(model), + status, + ); } else if super::should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), @@ -1779,6 +1823,17 @@ impl Provider for OpenAiCompatibleProvider { Some(model_owned.as_str()), status, ); + } else if super::is_custom_openai_upstream_bad_request_http_400( + provider_name.as_str(), + status, + &raw_error, + ) { + super::log_custom_openai_upstream_bad_request_http_400( + "stream_chat", + provider_name.as_str(), + Some(model_owned.as_str()), + status, + ); } else if super::should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), diff --git a/src/openhuman/inference/provider/ops.rs b/src/openhuman/inference/provider/ops.rs index cb3533de8a..378414af95 100644 --- a/src/openhuman/inference/provider/ops.rs +++ b/src/openhuman/inference/provider/ops.rs @@ -272,6 +272,25 @@ pub(super) fn is_budget_exhausted_http_400(status: reqwest::StatusCode, body: &s status == reqwest::StatusCode::BAD_REQUEST && super::is_budget_exhausted_message(body) } +/// Whether a custom OpenAI-compatible proxy returned the known generic +/// upstream 400 envelope: +/// `{"error":{"message":"Bad request to upstream provider","type":"upstream_error","status":400}}`. +/// +/// This shape is deterministic provider/user-state (endpoint-model mismatch, +/// unsupported schema, provider-side validation) and does not provide +/// actionable signal for OpenHuman Sentry triage. +pub(super) fn is_custom_openai_upstream_bad_request_http_400( + provider: &str, + status: reqwest::StatusCode, + body: &str, +) -> bool { + if provider != "custom_openai" || status != reqwest::StatusCode::BAD_REQUEST { + return false; + } + let lower = body.to_ascii_lowercase(); + lower.contains("bad request to upstream provider") && lower.contains("upstream_error") +} + pub(super) fn log_budget_exhausted_http_400( operation: &str, provider: &str, @@ -290,6 +309,25 @@ pub(super) fn log_budget_exhausted_http_400( ); } +pub(super) fn log_custom_openai_upstream_bad_request_http_400( + operation: &str, + provider: &str, + model: Option<&str>, + status: reqwest::StatusCode, +) { + tracing::info!( + domain = "llm_provider", + operation = operation, + provider = provider, + model = model.unwrap_or(""), + status = status.as_u16(), + failure = "non_2xx", + kind = "provider_user_state", + reason = "custom_openai_upstream_bad_request", + "[llm_provider] {operation} custom_openai upstream 400 — not reporting to Sentry" + ); +} + /// Build a sanitized provider error from a failed HTTP response. /// /// Reports the failure to Sentry with `provider` and `status` tags so @@ -321,6 +359,8 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E let is_auth_failure = matches!(status.as_u16(), 401 | 403); let is_backend = provider == openhuman_backend::PROVIDER_LABEL; let is_budget_exhausted_user_state = is_budget_exhausted_http_400(status, &body); + let is_custom_openai_upstream_bad_request = + is_custom_openai_upstream_bad_request_http_400(provider, status, &body); if is_auth_failure && is_backend { tracing::warn!( @@ -343,6 +383,8 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E ); } else if is_budget_exhausted_user_state { log_budget_exhausted_http_400("api_error", provider, None, status); + } else if is_custom_openai_upstream_bad_request { + log_custom_openai_upstream_bad_request_http_400("api_error", provider, None, status); } else if should_report_provider_http_failure(status) { crate::core::observability::report_error( message.as_str(), From 980184b604677ef37a38a339c9ddd6488738570f Mon Sep 17 00:00:00 2001 From: Shanu Date: Mon, 18 May 2026 18:18:59 +0530 Subject: [PATCH 3/4] feat(database): implement busy timeout for SQLite connections to prevent locking issues --- src/openhuman/whatsapp_data/store.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/openhuman/whatsapp_data/store.rs b/src/openhuman/whatsapp_data/store.rs index 6d35df5ef0..2f72a6d848 100644 --- a/src/openhuman/whatsapp_data/store.rs +++ b/src/openhuman/whatsapp_data/store.rs @@ -7,10 +7,17 @@ use std::collections::HashMap; use std::path::Path; +use std::time::Duration; use anyhow::{Context, Result}; use rusqlite::{params, Connection}; +/// Matches the value used by `memory/tree/store.rs`. Without this, any +/// concurrent writer (e.g. an overlapping `whatsapp_data_ingest` tick, or +/// ingest racing with a UI-triggered list/search read on the same DB) gets +/// `SQLITE_BUSY` returned immediately as `database is locked`. +const SQLITE_BUSY_TIMEOUT: Duration = Duration::from_secs(15); + use crate::openhuman::whatsapp_data::types::{ ChatMeta, IngestMessage, ListChatsRequest, ListMessagesRequest, SearchMessagesRequest, WhatsAppChat, WhatsAppMessage, @@ -77,8 +84,11 @@ impl WhatsAppDataStore { } fn open_conn(&self) -> Result { - Connection::open(&self.db_path) - .with_context(|| format!("open whatsapp_data db: {}", self.db_path.display())) + let conn = Connection::open(&self.db_path) + .with_context(|| format!("open whatsapp_data db: {}", self.db_path.display()))?; + conn.busy_timeout(SQLITE_BUSY_TIMEOUT) + .context("configure whatsapp_data busy_timeout")?; + Ok(conn) } fn now_secs() -> i64 { From 5954fbe233bc43536d4b950db6bb7f6beefca1b6 Mon Sep 17 00:00:00 2001 From: Shanu Date: Mon, 18 May 2026 19:01:21 +0530 Subject: [PATCH 4/4] fix(observability): anchor custom_openai 400 matcher to provider prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per CodeRabbit feedback on PR #2107: the previous matcher demoted any error containing both "bad request to upstream provider" and "upstream_error" — too broad. Anchor it to the canonical envelope prefix "custom_openai api error (400" so it can't silence unrelated errors that happen to mention either substring (e.g. a future provider whose wire shape reuses one of them). Adds a regression test confirming an error that contains both inner substrings WITHOUT the custom_openai 400 anchor is no longer silenced. --- src/core/observability.rs | 44 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/src/core/observability.rs b/src/core/observability.rs index 01f8a9fd4b..e5e051007e 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -326,7 +326,22 @@ fn is_provider_user_state_message(lower: &str) -> bool { return true; } - if lower.contains("bad request to upstream provider") && lower.contains("upstream_error") { + // OPENHUMAN-TAURI-XX: custom_openai upstream rejected the request with + // its own 400. Wire shape produced by + // `inference/provider/compatible.rs::is_custom_openai_upstream_bad_request_http_400`: + // + // custom_openai API error (400 Bad Request): {"error":{ + // "message":"Bad request to upstream provider", + // "type":"upstream_error","status":400}} + // + // Anchored to the `custom_openai api error (400` prefix so this can't + // silence unrelated errors that happen to mention both + // "bad request to upstream provider" and "upstream_error" elsewhere + // (e.g. a future provider whose envelope reuses one of those strings). + if lower.contains("custom_openai api error (400") + && lower.contains("bad request to upstream provider") + && lower.contains("upstream_error") + { return true; } @@ -1441,6 +1456,33 @@ mod tests { ); } + /// Regression for CodeRabbit feedback on PR #2107: the matcher must + /// not demote unrelated errors that happen to contain both + /// "bad request to upstream provider" and "upstream_error" without + /// the `custom_openai API error (400` anchor. + #[test] + fn does_not_silence_unrelated_error_with_only_inner_substrings() { + // No `custom_openai API error (400` prefix → must NOT classify + // as ProviderUserState, otherwise we'd silence actionable bugs. + assert_eq!( + expected_error_kind( + "internal panic in router: bad request to upstream provider \ + (state=upstream_error)" + ), + None, + ); + + // A future hypothetical provider envelope reusing one substring + // also must not classify. + assert_eq!( + expected_error_kind( + "anthropic_api error: upstream_error encountered while \ + forwarding bad request to upstream provider" + ), + None, + ); + } + #[test] fn classifies_missing_required_fields_as_provider_user_state() { // OPENHUMAN-TAURI-97: composio authorize with a blank required