diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs index b51961cf78..b081d52ea2 100644 --- a/app/src-tauri/src/lib.rs +++ b/app/src-tauri/src/lib.rs @@ -1250,6 +1250,11 @@ pub fn run() { ); return None; } + if openhuman_core::core::observability::is_transient_backend_api_failure(&event) + || openhuman_core::core::observability::is_transient_integrations_failure(&event) + { + return None; + } // Strip server_name (hostname) to avoid leaking machine identity. event.server_name = None; event.user = None; diff --git a/src/api/rest.rs b/src/api/rest.rs index 1684197450..efab8d2961 100644 --- a/src/api/rest.rs +++ b/src/api/rest.rs @@ -423,16 +423,42 @@ impl BackendOAuthClient { } let response = request.send().await.map_err(|e| { - crate::core::observability::report_error( - e.to_string().as_str(), - "backend_api", - "authed_json", - &[ - ("method", method.as_str()), - ("path", url.path()), - ("failure", "transport"), - ], - ); + // Walk the error source chain so transient markers hidden in nested + // causes (reqwest -> hyper -> rustls TLS EOF, etc.) still classify + // correctly. The top-level `e.to_string()` often only carries the + // outermost wrapper, e.g. "error sending request for url (...)". + let mut error_message = e.to_string(); + let mut src: Option<&(dyn std::error::Error + 'static)> = std::error::Error::source(&e); + while let Some(s) = src { + error_message.push_str(" → "); + error_message.push_str(&s.to_string()); + src = s.source(); + } + if crate::core::observability::contains_transient_transport_phrase(&error_message) { + tracing::warn!( + domain = "backend_api", + operation = "authed_json", + method = method.as_str(), + path = url.path(), + failure = "transport", + error = %error_message, + "[backend_api] transient transport failure on {} {}: {}", + method.as_str(), + url.path(), + error_message, + ); + } else { + crate::core::observability::report_error( + error_message.as_str(), + "backend_api", + "authed_json", + &[ + ("method", method.as_str()), + ("path", url.path()), + ("failure", "transport"), + ], + ); + } anyhow::Error::new(e).context(format!( "backend request {} {}", method.as_str(), @@ -445,15 +471,19 @@ impl BackendOAuthClient { if !status.is_success() { let status_code = status.as_u16(); let status_str = status_code.to_string(); - // 502/503/504 are transient infrastructure errors (proxy/CDN/backend + // These are transient infrastructure errors (proxy/CDN/backend // temporarily unavailable). They are not code bugs and callers already // implement retry/disable logic, so skip Sentry to avoid noise. - let is_transient_infra = matches!(status_code, 502 | 503 | 504); + let is_transient_infra = + crate::core::observability::is_transient_http_status_code(status_code); if is_transient_infra { tracing::warn!( + domain = "backend_api", + operation = "authed_json", method = method.as_str(), path = url.path(), status = status_code, + failure = "non_2xx", "[backend_api] transient {status} on {} {} — not reporting to Sentry", method.as_str(), url.path(), diff --git a/src/core/jsonrpc.rs b/src/core/jsonrpc.rs index 6f07b49259..7ee76687b5 100644 --- a/src/core/jsonrpc.rs +++ b/src/core/jsonrpc.rs @@ -111,6 +111,26 @@ pub async fn rpc_handler(State(state): State, Json(req): Json err ({}ms): {}", method, ms, display_message); + } else if crate::core::observability::is_transient_message_failure(&display_message) { + // Downstream call (backend_api / integrations / provider) already + // demoted the underlying transient failure to a warn. The error + // string still propagates up to here; re-reporting at error level + // would re-create the very Sentry noise the lower-layer demote + // was meant to avoid (#8Z, #93, #8W, #96). + // + // Redact before logging — `display_message` is upstream-derived + // (backend / provider response) and can carry URL fragments, + // query params, or pasted-through provider error text that + // includes tokens. `sanitize_api_error` runs the same scrub + // used in the SessionExpired publish path below. + let redacted = + crate::openhuman::providers::ops::sanitize_api_error(&display_message); + tracing::warn!( + method = %method, + elapsed_ms = ms as u64, + error = %redacted, + "[rpc] transient downstream failure — not reporting to Sentry (message redacted)" + ); } else { crate::core::observability::report_error_or_expected( display_message.as_str(), diff --git a/src/core/observability.rs b/src/core/observability.rs index 521a6db9f1..a7a793943c 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -33,7 +33,24 @@ pub type Tag<'a> = (&'a str, &'a str); /// (`openhuman::providers::ops::should_report_provider_http_failure`) and the /// `before_send` filter (`is_transient_provider_http_failure`). Update here /// and both sites pick it up — keeps the two layers from drifting. -pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504]; +pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504, 520]; + +/// HTTP status codes that represent transient backend / integration transport +/// failures rather than application bugs. Keep this as strings because Sentry +/// tags are strings, and the before_send classifiers match tag values exactly. +pub const TRANSIENT_HTTP_STATUSES: &[&str] = &["408", "429", "502", "503", "504", "520"]; + +/// Transport-layer phrases observed from reqwest / hyper for temporary +/// upstream interruptions. Keep these specific so rare configuration failures +/// still reach Sentry. +pub const TRANSIENT_TRANSPORT_PHRASES: &[&str] = &[ + "timeout", + "operation timed out", + "connection forcibly closed", + "connection reset", + "tls handshake eof", + "error sending request", +]; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ExpectedErrorKind { @@ -294,6 +311,102 @@ pub fn is_transient_provider_http_failure(event: &sentry::protocol::Event<'_>) - TRANSIENT_PROVIDER_HTTP_STATUSES.contains(&status_u16) } +pub fn is_transient_http_status(status: &str) -> bool { + TRANSIENT_HTTP_STATUSES.contains(&status) +} + +pub fn is_transient_http_status_code(status: u16) -> bool { + let status = status.to_string(); + is_transient_http_status(status.as_str()) +} + +pub fn contains_transient_transport_phrase(message: &str) -> bool { + let lower = message.to_ascii_lowercase(); + TRANSIENT_TRANSPORT_PHRASES + .iter() + .any(|phrase| lower.contains(phrase)) +} + +fn event_has_transient_transport_phrase(event: &sentry::protocol::Event<'_>) -> bool { + event + .message + .as_deref() + .is_some_and(contains_transient_transport_phrase) + || event + .logentry + .as_ref() + .is_some_and(|log| contains_transient_transport_phrase(&log.message)) + || event.exception.values.iter().any(|exception| { + exception + .value + .as_deref() + .is_some_and(contains_transient_transport_phrase) + }) +} + +fn is_transient_domain_failure(event: &sentry::protocol::Event<'_>, domain: &str) -> bool { + let tags = &event.tags; + if tags.get("domain").map(String::as_str) != Some(domain) { + return false; + } + + match tags.get("failure").map(String::as_str) { + Some("non_2xx") => tags + .get("status") + .is_some_and(|status| is_transient_http_status(status)), + Some("transport") => event_has_transient_transport_phrase(event), + _ => false, + } +} + +/// Transient backend API failures (gateway hiccups, scheduled downtime). +/// Match by event tags written by report_error at the authed_json call site. +pub fn is_transient_backend_api_failure(event: &sentry::protocol::Event<'_>) -> bool { + is_transient_domain_failure(event, "backend_api") +} + +/// Transient integrations / Composio failures (timeout, connection reset, +/// gateway hiccups). +pub fn is_transient_integrations_failure(event: &sentry::protocol::Event<'_>) -> bool { + is_transient_domain_failure(event, "integrations") +} + +/// String tokens that mark a formatted error message as a transient HTTP +/// failure. Used at upstream emit sites (`rpc.invoke_method`, +/// `web_channel.run_chat_task`) where the error has already been stringified +/// and the original `status` / `failure` tag context is gone. +/// +/// Each token combines a status code with a non-numeric anchor (parenthesis +/// or canonical reason phrase) so bare numeric coincidences ("process 502 +/// exited") do not match. +const TRANSIENT_STATUS_MESSAGE_TOKENS: &[&str] = &[ + "(408 ", + "(429 ", + "(502 ", + "(503 ", + "(504 ", + "(520 ", + "408 request timeout", + "429 too many requests", + "502 bad gateway", + "503 service unavailable", + "504 gateway timeout", + "520 ", +]; + +/// Returns true when a formatted error message describes a transient HTTP +/// or transport-layer failure that has already been demoted further down the +/// stack. Use at upstream re-emit sites (`rpc.invoke_method`, +/// `web_channel.run_chat_task`) where `report_error` is called with the +/// stringified downstream error and no `failure` / `status` tag context. +pub fn is_transient_message_failure(msg: &str) -> bool { + let lower = msg.to_ascii_lowercase(); + TRANSIENT_STATUS_MESSAGE_TOKENS + .iter() + .any(|token| lower.contains(token)) + || contains_transient_transport_phrase(&lower) +} + #[cfg(test)] mod tests { use super::*; @@ -527,6 +640,15 @@ mod tests { event } + fn event_with_tags_and_message( + pairs: &[(&str, &str)], + message: &str, + ) -> sentry::protocol::Event<'static> { + let mut event = event_with_tags(pairs); + event.message = Some(message.to_string()); + event + } + #[test] fn transient_filter_drops_429_408_502_503_504() { for status in ["429", "408", "502", "503", "504"] { @@ -608,6 +730,181 @@ mod tests { ); } + #[test] + fn backend_api_filter_drops_transient_statuses() { + for status in TRANSIENT_HTTP_STATUSES { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + is_transient_backend_api_failure(&event), + "backend status {status} must be classified as transient" + ); + } + } + + #[test] + fn backend_api_filter_drops_transient_transport_phrases() { + for phrase in TRANSIENT_TRANSPORT_PHRASES { + let event = event_with_tags_and_message( + &[("domain", "backend_api"), ("failure", "transport")], + &format!("GET /teams failed: {phrase}"), + ); + assert!( + is_transient_backend_api_failure(&event), + "backend transport phrase {phrase} must be classified as transient" + ); + } + } + + #[test] + fn backend_api_filter_keeps_non_transient_failures() { + for status in ["404", "500"] { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + !is_transient_backend_api_failure(&event), + "backend status {status} must stay visible" + ); + } + + let wrong_domain = event_with_tags(&[ + ("domain", "scheduler"), + ("failure", "non_2xx"), + ("status", "503"), + ]); + assert!( + !is_transient_backend_api_failure(&wrong_domain), + "domain scoping must keep unrelated transient-shaped events visible" + ); + + let non_matching_transport = event_with_tags_and_message( + &[("domain", "backend_api"), ("failure", "transport")], + "GET /teams failed: certificate verify failed", + ); + assert!( + !is_transient_backend_api_failure(&non_matching_transport), + "transport failures without an allowlisted phrase must stay visible" + ); + } + + #[test] + fn integrations_filter_drops_transient_statuses() { + for status in TRANSIENT_HTTP_STATUSES { + let event = event_with_tags(&[ + ("domain", "integrations"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + is_transient_integrations_failure(&event), + "integrations status {status} must be classified as transient" + ); + } + } + + #[test] + fn integrations_filter_drops_transient_transport_phrases() { + for phrase in TRANSIENT_TRANSPORT_PHRASES { + let event = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + &format!("GET /agent-integrations/tools failed: {phrase}"), + ); + assert!( + is_transient_integrations_failure(&event), + "integrations transport phrase {phrase} must be classified as transient" + ); + } + } + + #[test] + fn integrations_filter_keeps_non_transient_failures() { + for status in ["404", "500"] { + let event = event_with_tags(&[ + ("domain", "integrations"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + !is_transient_integrations_failure(&event), + "integrations status {status} must stay visible" + ); + } + + let wrong_domain = event_with_tags(&[ + ("domain", "composio"), + ("failure", "non_2xx"), + ("status", "503"), + ]); + assert!( + !is_transient_integrations_failure(&wrong_domain), + "domain scoping must keep composio-tagged events visible" + ); + + let non_matching_transport = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + "GET /agent-integrations/tools failed: invalid certificate", + ); + assert!( + !is_transient_integrations_failure(&non_matching_transport), + "transport failures without an allowlisted phrase must stay visible" + ); + } + + #[test] + fn message_failure_classifier_matches_canonical_status_phrases() { + for msg in [ + "rpc.invoke_method failed: GET /teams failed (502 Bad Gateway)", + "GET /teams/me/usage failed (503 Service Unavailable)", + "downstream returned (504 Gateway Timeout): retry budget exhausted", + "OpenHuman API error (520 ): cf", + "POST /channels/telegram/typing failed (429 Too Many Requests)", + "auth connect failed: 503 Service Unavailable", + ] { + assert!( + is_transient_message_failure(msg), + "{msg:?} must be classified as transient" + ); + } + } + + #[test] + fn message_failure_classifier_matches_transport_phrases() { + for msg in [ + "integrations.get failed: composio/tools → operation timed out", + "GET https://api.example.com → connection forcibly closed (os 10054)", + "POST /v1/foo → tls handshake eof", + "error sending request for url (https://api.example.com)", + ] { + assert!( + is_transient_message_failure(msg), + "{msg:?} must be classified as transient" + ); + } + } + + #[test] + fn message_failure_classifier_keeps_unrelated_messages() { + for msg in [ + "rpc.invoke_method failed: schema validation error", + "process 502 exited unexpectedly", + "GET /teams failed (404 Not Found)", + "GET /teams failed (500 Internal Server Error)", + "unrelated error with port 5023", + "", + ] { + assert!( + !is_transient_message_failure(msg), + "{msg:?} must not be classified as transient" + ); + } + } + #[test] fn report_error_or_expected_does_not_panic() { report_error_or_expected( diff --git a/src/main.rs b/src/main.rs index e5fb20f13e..65316a6745 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,6 +59,11 @@ fn main() { if openhuman_core::core::observability::is_transient_provider_http_failure(&event) { return None; } + if openhuman_core::core::observability::is_transient_backend_api_failure(&event) + || openhuman_core::core::observability::is_transient_integrations_failure(&event) + { + return None; + } // Strip server_name (hostname) to avoid leaking machine identity event.server_name = None; // Attach the cached account uid so Sentry can count unique users diff --git a/src/openhuman/integrations/client.rs b/src/openhuman/integrations/client.rs index 0c354ac783..b60bcf1a96 100644 --- a/src/openhuman/integrations/client.rs +++ b/src/openhuman/integrations/client.rs @@ -121,16 +121,27 @@ impl IntegrationClient { let body_text = resp.text().await.unwrap_or_default(); let detail = extract_error_detail(&body_text, MAX_ERROR_BODY_LEN); let status_str = status.as_u16().to_string(); - crate::core::observability::report_error( - format!("Backend returned {status} for POST {url}: {detail}").as_str(), - "integrations", - "post", - &[ - ("path", path), - ("status", status_str.as_str()), - ("failure", "non_2xx"), - ], - ); + if crate::core::observability::is_transient_http_status_code(status.as_u16()) { + tracing::warn!( + domain = "integrations", + operation = "post", + path = path, + status = status_str.as_str(), + failure = "non_2xx", + "[integrations] transient {status} for POST {url}: {detail}", + ); + } else { + crate::core::observability::report_error( + format!("Backend returned {status} for POST {url}: {detail}").as_str(), + "integrations", + "post", + &[ + ("path", path), + ("status", status_str.as_str()), + ("failure", "non_2xx"), + ], + ); + } anyhow::bail!("Backend returned {status} for POST {url}: {detail}"); } @@ -189,16 +200,27 @@ impl IntegrationClient { let body_text = resp.text().await.unwrap_or_default(); let detail = extract_error_detail(&body_text, MAX_ERROR_BODY_LEN); let status_str = status.as_u16().to_string(); - crate::core::observability::report_error( - format!("Backend returned {status} for GET {url}: {detail}").as_str(), - "integrations", - "get", - &[ - ("path", path), - ("status", status_str.as_str()), - ("failure", "non_2xx"), - ], - ); + if crate::core::observability::is_transient_http_status_code(status.as_u16()) { + tracing::warn!( + domain = "integrations", + operation = "get", + path = path, + status = status_str.as_str(), + failure = "non_2xx", + "[integrations] transient {status} for GET {url}: {detail}", + ); + } else { + crate::core::observability::report_error( + format!("Backend returned {status} for GET {url}: {detail}").as_str(), + "integrations", + "get", + &[ + ("path", path), + ("status", status_str.as_str()), + ("failure", "non_2xx"), + ], + ); + } anyhow::bail!("Backend returned {status} for GET {url}: {detail}"); } diff --git a/tests/observability_smoke.rs b/tests/observability_smoke.rs index 47294d49bb..f58c003c1b 100644 --- a/tests/observability_smoke.rs +++ b/tests/observability_smoke.rs @@ -1,5 +1,5 @@ -//! Runtime smoke for the Sentry `before_send` filter that drops per-attempt -//! transient-upstream provider failures (OPENHUMAN-TAURI-2E / 84 / T). +//! Runtime smoke for the Sentry `before_send` filters that drop per-attempt +//! transient-upstream provider, backend_api, and integrations failures. //! //! Unit tests in `src/core/observability.rs` exercise the pure filter //! function. This integration test wires the actual `sentry::init` → @@ -7,7 +7,10 @@ //! behaves as designed: transient events are dropped, permanent events //! and aggregate `all_exhausted` events still surface. -use openhuman_core::core::observability::is_transient_provider_http_failure; +use openhuman_core::core::observability::{ + is_transient_backend_api_failure, is_transient_integrations_failure, + is_transient_provider_http_failure, +}; use sentry::protocol::Event; use std::collections::BTreeMap; use std::sync::Arc; @@ -22,6 +25,12 @@ fn event_with_tags(tags: &[(&str, &str)]) -> Event<'static> { event } +fn event_with_tags_and_message(tags: &[(&str, &str)], message: &str) -> Event<'static> { + let mut event = event_with_tags(tags); + event.message = Some(message.to_string()); + event +} + /// Drive an envelope-capturing Sentry client through a sequence of events /// and return how many made it past `before_send`. /// @@ -46,7 +55,10 @@ fn count_captured(events: Vec>) -> usize { ), // Same filter shape the real binary installs in main.rs. before_send: Some(Arc::new(|event| { - if is_transient_provider_http_failure(&event) { + if is_transient_provider_http_failure(&event) + || is_transient_backend_api_failure(&event) + || is_transient_integrations_failure(&event) + { None } else { Some(event) @@ -68,6 +80,38 @@ fn count_captured(events: Vec>) -> usize { transport.fetch_and_clear_envelopes().len() } +#[test] +fn drops_backend_api_transient_statuses() { + let events = ["408", "429", "502", "503", "504", "520"] + .into_iter() + .map(|status| { + event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]) + }) + .collect(); + assert_eq!( + count_captured(events), + 0, + "transient backend_api statuses must be filtered in before_send" + ); +} + +#[test] +fn drops_integrations_transient_transport_timeout() { + let event = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + "GET /agent-integrations/tools failed: operation timed out", + ); + assert_eq!( + count_captured(vec![event]), + 0, + "transient integrations timeouts must be filtered in before_send" + ); +} + #[test] fn drops_per_attempt_429_503_504_408_502() { // Each of these matches the tag shape `ops::api_error` sets when a @@ -111,6 +155,20 @@ fn keeps_permanent_failures() { ); } +#[test] +fn keeps_backend_api_404_failure() { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", "404"), + ]); + assert_eq!( + count_captured(vec![event]), + 1, + "non-transient backend_api 404 failures must reach Sentry" + ); +} + #[test] fn keeps_aggregate_all_exhausted_event() { // The reliable_chat layer fires a single aggregate