From 362400c4d9b507db8d8319bd308188e11ac13522 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:42:01 +0530 Subject: [PATCH 01/11] feat(observability): add transient backend constants --- src/core/observability.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index 521a6db9f1..2dd99557c5 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -35,6 +35,23 @@ pub type Tag<'a> = (&'a str, &'a str); /// and both sites pick it up — keeps the two layers from drifting. pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504]; +/// HTTP status codes that represent transient backend / integration transport +/// failures rather than application bugs. Keep this as strings because Sentry +/// tags are strings, and the before_send classifiers match tag values exactly. +pub const TRANSIENT_HTTP_STATUSES: &[&str] = &["408", "429", "502", "503", "504", "520"]; + +/// Transport-layer phrases observed from reqwest / hyper for temporary +/// upstream interruptions. Keep these specific so rare configuration failures +/// still reach Sentry. +pub const TRANSIENT_TRANSPORT_PHRASES: &[&str] = &[ + "timeout", + "operation timed out", + "connection forcibly closed", + "connection reset", + "tls handshake eof", + "error sending request", +]; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ExpectedErrorKind { LocalAiDisabled, From 2f7a2185abc9ec28a0eb1bebc8eb6fe402fd890f Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:46:54 +0530 Subject: [PATCH 02/11] feat(observability): classify transient backend_api failures --- src/core/observability.rs | 126 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index 2dd99557c5..b6fccbd3c8 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -311,6 +311,60 @@ pub fn is_transient_provider_http_failure(event: &sentry::protocol::Event<'_>) - TRANSIENT_PROVIDER_HTTP_STATUSES.contains(&status_u16) } +pub fn is_transient_http_status(status: &str) -> bool { + TRANSIENT_HTTP_STATUSES.contains(&status) +} + +pub fn is_transient_http_status_code(status: u16) -> bool { + let status = status.to_string(); + is_transient_http_status(status.as_str()) +} + +pub fn contains_transient_transport_phrase(message: &str) -> bool { + let lower = message.to_ascii_lowercase(); + TRANSIENT_TRANSPORT_PHRASES + .iter() + .any(|phrase| lower.contains(phrase)) +} + +fn event_has_transient_transport_phrase(event: &sentry::protocol::Event<'_>) -> bool { + event + .message + .as_deref() + .is_some_and(contains_transient_transport_phrase) + || event + .logentry + .as_ref() + .is_some_and(|log| contains_transient_transport_phrase(&log.message)) + || event.exception.values.iter().any(|exception| { + exception + .value + .as_deref() + .is_some_and(contains_transient_transport_phrase) + }) +} + +fn is_transient_domain_failure(event: &sentry::protocol::Event<'_>, domain: &str) -> bool { + let tags = &event.tags; + if tags.get("domain").map(String::as_str) != Some(domain) { + return false; + } + + match tags.get("failure").map(String::as_str) { + Some("non_2xx") => tags + .get("status") + .is_some_and(|status| is_transient_http_status(status)), + Some("transport") => event_has_transient_transport_phrase(event), + _ => false, + } +} + +/// Transient backend API failures (gateway hiccups, scheduled downtime). +/// Match by event tags written by report_error at the authed_json call site. +pub fn is_transient_backend_api_failure(event: &sentry::protocol::Event<'_>) -> bool { + is_transient_domain_failure(event, "backend_api") +} + #[cfg(test)] mod tests { use super::*; @@ -544,6 +598,15 @@ mod tests { event } + fn event_with_tags_and_message( + pairs: &[(&str, &str)], + message: &str, + ) -> sentry::protocol::Event<'static> { + let mut event = event_with_tags(pairs); + event.message = Some(message.to_string()); + event + } + #[test] fn transient_filter_drops_429_408_502_503_504() { for status in ["429", "408", "502", "503", "504"] { @@ -625,6 +688,69 @@ mod tests { ); } + #[test] + fn backend_api_filter_drops_transient_statuses() { + for status in TRANSIENT_HTTP_STATUSES { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + is_transient_backend_api_failure(&event), + "backend status {status} must be classified as transient" + ); + } + } + + #[test] + fn backend_api_filter_drops_transient_transport_phrases() { + for phrase in TRANSIENT_TRANSPORT_PHRASES { + let event = event_with_tags_and_message( + &[("domain", "backend_api"), ("failure", "transport")], + &format!("GET /teams failed: {phrase}"), + ); + assert!( + is_transient_backend_api_failure(&event), + "backend transport phrase {phrase} must be classified as transient" + ); + } + } + + #[test] + fn backend_api_filter_keeps_non_transient_failures() { + for status in ["404", "500"] { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + !is_transient_backend_api_failure(&event), + "backend status {status} must stay visible" + ); + } + + let wrong_domain = event_with_tags(&[ + ("domain", "scheduler"), + ("failure", "non_2xx"), + ("status", "503"), + ]); + assert!( + !is_transient_backend_api_failure(&wrong_domain), + "domain scoping must keep unrelated transient-shaped events visible" + ); + + let non_matching_transport = event_with_tags_and_message( + &[("domain", "backend_api"), ("failure", "transport")], + "GET /teams failed: certificate verify failed", + ); + assert!( + !is_transient_backend_api_failure(&non_matching_transport), + "transport failures without an allowlisted phrase must stay visible" + ); + } + #[test] fn report_error_or_expected_does_not_panic() { report_error_or_expected( From 9b25fc5098efc3266dead380e47a078c9e7099f1 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:48:06 +0530 Subject: [PATCH 03/11] feat(observability): classify transient integrations failures --- src/core/observability.rs | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index b6fccbd3c8..1e54065ddf 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -365,6 +365,12 @@ pub fn is_transient_backend_api_failure(event: &sentry::protocol::Event<'_>) -> is_transient_domain_failure(event, "backend_api") } +/// Transient integrations / Composio failures (timeout, connection reset, +/// gateway hiccups). +pub fn is_transient_integrations_failure(event: &sentry::protocol::Event<'_>) -> bool { + is_transient_domain_failure(event, "integrations") +} + #[cfg(test)] mod tests { use super::*; @@ -751,6 +757,69 @@ mod tests { ); } + #[test] + fn integrations_filter_drops_transient_statuses() { + for status in TRANSIENT_HTTP_STATUSES { + let event = event_with_tags(&[ + ("domain", "integrations"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + is_transient_integrations_failure(&event), + "integrations status {status} must be classified as transient" + ); + } + } + + #[test] + fn integrations_filter_drops_transient_transport_phrases() { + for phrase in TRANSIENT_TRANSPORT_PHRASES { + let event = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + &format!("GET /agent-integrations/tools failed: {phrase}"), + ); + assert!( + is_transient_integrations_failure(&event), + "integrations transport phrase {phrase} must be classified as transient" + ); + } + } + + #[test] + fn integrations_filter_keeps_non_transient_failures() { + for status in ["404", "500"] { + let event = event_with_tags(&[ + ("domain", "integrations"), + ("failure", "non_2xx"), + ("status", status), + ]); + assert!( + !is_transient_integrations_failure(&event), + "integrations status {status} must stay visible" + ); + } + + let wrong_domain = event_with_tags(&[ + ("domain", "composio"), + ("failure", "non_2xx"), + ("status", "503"), + ]); + assert!( + !is_transient_integrations_failure(&wrong_domain), + "domain scoping must keep composio-tagged events visible" + ); + + let non_matching_transport = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + "GET /agent-integrations/tools failed: invalid certificate", + ); + assert!( + !is_transient_integrations_failure(&non_matching_transport), + "transport failures without an allowlisted phrase must stay visible" + ); + } + #[test] fn report_error_or_expected_does_not_panic() { report_error_or_expected( From 2d4a07a6b4c41966f6ed5ac62db3ae6aa6199b47 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:49:04 +0530 Subject: [PATCH 04/11] fix(observability): demote transient backend and integration emissions --- src/api/rest.rs | 44 ++++++++++++++------ src/openhuman/integrations/client.rs | 62 +++++++++++++++++++--------- 2 files changed, 74 insertions(+), 32 deletions(-) diff --git a/src/api/rest.rs b/src/api/rest.rs index 1684197450..e5aa6d073d 100644 --- a/src/api/rest.rs +++ b/src/api/rest.rs @@ -423,16 +423,32 @@ impl BackendOAuthClient { } let response = request.send().await.map_err(|e| { - crate::core::observability::report_error( - e.to_string().as_str(), - "backend_api", - "authed_json", - &[ - ("method", method.as_str()), - ("path", url.path()), - ("failure", "transport"), - ], - ); + let error_message = e.to_string(); + if crate::core::observability::contains_transient_transport_phrase(&error_message) { + tracing::warn!( + domain = "backend_api", + operation = "authed_json", + method = method.as_str(), + path = url.path(), + failure = "transport", + error = %error_message, + "[backend_api] transient transport failure on {} {}: {}", + method.as_str(), + url.path(), + error_message, + ); + } else { + crate::core::observability::report_error( + error_message.as_str(), + "backend_api", + "authed_json", + &[ + ("method", method.as_str()), + ("path", url.path()), + ("failure", "transport"), + ], + ); + } anyhow::Error::new(e).context(format!( "backend request {} {}", method.as_str(), @@ -445,15 +461,19 @@ impl BackendOAuthClient { if !status.is_success() { let status_code = status.as_u16(); let status_str = status_code.to_string(); - // 502/503/504 are transient infrastructure errors (proxy/CDN/backend + // These are transient infrastructure errors (proxy/CDN/backend // temporarily unavailable). They are not code bugs and callers already // implement retry/disable logic, so skip Sentry to avoid noise. - let is_transient_infra = matches!(status_code, 502 | 503 | 504); + let is_transient_infra = + crate::core::observability::is_transient_http_status_code(status_code); if is_transient_infra { tracing::warn!( + domain = "backend_api", + operation = "authed_json", method = method.as_str(), path = url.path(), status = status_code, + failure = "non_2xx", "[backend_api] transient {status} on {} {} — not reporting to Sentry", method.as_str(), url.path(), diff --git a/src/openhuman/integrations/client.rs b/src/openhuman/integrations/client.rs index 0c354ac783..b60bcf1a96 100644 --- a/src/openhuman/integrations/client.rs +++ b/src/openhuman/integrations/client.rs @@ -121,16 +121,27 @@ impl IntegrationClient { let body_text = resp.text().await.unwrap_or_default(); let detail = extract_error_detail(&body_text, MAX_ERROR_BODY_LEN); let status_str = status.as_u16().to_string(); - crate::core::observability::report_error( - format!("Backend returned {status} for POST {url}: {detail}").as_str(), - "integrations", - "post", - &[ - ("path", path), - ("status", status_str.as_str()), - ("failure", "non_2xx"), - ], - ); + if crate::core::observability::is_transient_http_status_code(status.as_u16()) { + tracing::warn!( + domain = "integrations", + operation = "post", + path = path, + status = status_str.as_str(), + failure = "non_2xx", + "[integrations] transient {status} for POST {url}: {detail}", + ); + } else { + crate::core::observability::report_error( + format!("Backend returned {status} for POST {url}: {detail}").as_str(), + "integrations", + "post", + &[ + ("path", path), + ("status", status_str.as_str()), + ("failure", "non_2xx"), + ], + ); + } anyhow::bail!("Backend returned {status} for POST {url}: {detail}"); } @@ -189,16 +200,27 @@ impl IntegrationClient { let body_text = resp.text().await.unwrap_or_default(); let detail = extract_error_detail(&body_text, MAX_ERROR_BODY_LEN); let status_str = status.as_u16().to_string(); - crate::core::observability::report_error( - format!("Backend returned {status} for GET {url}: {detail}").as_str(), - "integrations", - "get", - &[ - ("path", path), - ("status", status_str.as_str()), - ("failure", "non_2xx"), - ], - ); + if crate::core::observability::is_transient_http_status_code(status.as_u16()) { + tracing::warn!( + domain = "integrations", + operation = "get", + path = path, + status = status_str.as_str(), + failure = "non_2xx", + "[integrations] transient {status} for GET {url}: {detail}", + ); + } else { + crate::core::observability::report_error( + format!("Backend returned {status} for GET {url}: {detail}").as_str(), + "integrations", + "get", + &[ + ("path", path), + ("status", status_str.as_str()), + ("failure", "non_2xx"), + ], + ); + } anyhow::bail!("Backend returned {status} for GET {url}: {detail}"); } From b8dafb8ac062ab33287ea2fabf06b9e9981d0973 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:49:19 +0530 Subject: [PATCH 05/11] chore(observability): wire backend transient before_send filters --- app/src-tauri/src/lib.rs | 5 +++++ src/main.rs | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs index b51961cf78..b081d52ea2 100644 --- a/app/src-tauri/src/lib.rs +++ b/app/src-tauri/src/lib.rs @@ -1250,6 +1250,11 @@ pub fn run() { ); return None; } + if openhuman_core::core::observability::is_transient_backend_api_failure(&event) + || openhuman_core::core::observability::is_transient_integrations_failure(&event) + { + return None; + } // Strip server_name (hostname) to avoid leaking machine identity. event.server_name = None; event.user = None; diff --git a/src/main.rs b/src/main.rs index e5fb20f13e..65316a6745 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,6 +59,11 @@ fn main() { if openhuman_core::core::observability::is_transient_provider_http_failure(&event) { return None; } + if openhuman_core::core::observability::is_transient_backend_api_failure(&event) + || openhuman_core::core::observability::is_transient_integrations_failure(&event) + { + return None; + } // Strip server_name (hostname) to avoid leaking machine identity event.server_name = None; // Attach the cached account uid so Sentry can count unique users From c4a0f0992cda79ed735bbcedda8a4c4170fa27fc Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 16:49:41 +0530 Subject: [PATCH 06/11] test(observability): smoke backend integration transient filters --- tests/observability_smoke.rs | 66 +++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/tests/observability_smoke.rs b/tests/observability_smoke.rs index 47294d49bb..f58c003c1b 100644 --- a/tests/observability_smoke.rs +++ b/tests/observability_smoke.rs @@ -1,5 +1,5 @@ -//! Runtime smoke for the Sentry `before_send` filter that drops per-attempt -//! transient-upstream provider failures (OPENHUMAN-TAURI-2E / 84 / T). +//! Runtime smoke for the Sentry `before_send` filters that drop per-attempt +//! transient-upstream provider, backend_api, and integrations failures. //! //! Unit tests in `src/core/observability.rs` exercise the pure filter //! function. This integration test wires the actual `sentry::init` → @@ -7,7 +7,10 @@ //! behaves as designed: transient events are dropped, permanent events //! and aggregate `all_exhausted` events still surface. -use openhuman_core::core::observability::is_transient_provider_http_failure; +use openhuman_core::core::observability::{ + is_transient_backend_api_failure, is_transient_integrations_failure, + is_transient_provider_http_failure, +}; use sentry::protocol::Event; use std::collections::BTreeMap; use std::sync::Arc; @@ -22,6 +25,12 @@ fn event_with_tags(tags: &[(&str, &str)]) -> Event<'static> { event } +fn event_with_tags_and_message(tags: &[(&str, &str)], message: &str) -> Event<'static> { + let mut event = event_with_tags(tags); + event.message = Some(message.to_string()); + event +} + /// Drive an envelope-capturing Sentry client through a sequence of events /// and return how many made it past `before_send`. /// @@ -46,7 +55,10 @@ fn count_captured(events: Vec>) -> usize { ), // Same filter shape the real binary installs in main.rs. before_send: Some(Arc::new(|event| { - if is_transient_provider_http_failure(&event) { + if is_transient_provider_http_failure(&event) + || is_transient_backend_api_failure(&event) + || is_transient_integrations_failure(&event) + { None } else { Some(event) @@ -68,6 +80,38 @@ fn count_captured(events: Vec>) -> usize { transport.fetch_and_clear_envelopes().len() } +#[test] +fn drops_backend_api_transient_statuses() { + let events = ["408", "429", "502", "503", "504", "520"] + .into_iter() + .map(|status| { + event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", status), + ]) + }) + .collect(); + assert_eq!( + count_captured(events), + 0, + "transient backend_api statuses must be filtered in before_send" + ); +} + +#[test] +fn drops_integrations_transient_transport_timeout() { + let event = event_with_tags_and_message( + &[("domain", "integrations"), ("failure", "transport")], + "GET /agent-integrations/tools failed: operation timed out", + ); + assert_eq!( + count_captured(vec![event]), + 0, + "transient integrations timeouts must be filtered in before_send" + ); +} + #[test] fn drops_per_attempt_429_503_504_408_502() { // Each of these matches the tag shape `ops::api_error` sets when a @@ -111,6 +155,20 @@ fn keeps_permanent_failures() { ); } +#[test] +fn keeps_backend_api_404_failure() { + let event = event_with_tags(&[ + ("domain", "backend_api"), + ("failure", "non_2xx"), + ("status", "404"), + ]); + assert_eq!( + count_captured(vec![event]), + 1, + "non-transient backend_api 404 failures must reach Sentry" + ); +} + #[test] fn keeps_aggregate_all_exhausted_event() { // The reliable_chat layer fires a single aggregate From f18c4db5517a7485f9cd8f7e4bc576c329187343 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 17:19:59 +0530 Subject: [PATCH 07/11] feat(observability): add 520 to TRANSIENT_PROVIDER_HTTP_STATUSES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cloudflare 520 (``) is the same upstream-hiccup shape as 502/503/504 — closes the OPENHUMAN-TAURI-49 leak through `is_transient_provider_http_failure` without a separate classifier. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/observability.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/observability.rs b/src/core/observability.rs index 1e54065ddf..924d730675 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -33,7 +33,7 @@ pub type Tag<'a> = (&'a str, &'a str); /// (`openhuman::providers::ops::should_report_provider_http_failure`) and the /// `before_send` filter (`is_transient_provider_http_failure`). Update here /// and both sites pick it up — keeps the two layers from drifting. -pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504]; +pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504, 520]; /// HTTP status codes that represent transient backend / integration transport /// failures rather than application bugs. Keep this as strings because Sentry From 8666e72d40c5e225b4e7d6e3e6ad75dbd724fbd0 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 17:20:12 +0530 Subject: [PATCH 08/11] feat(observability): is_transient_message_failure for tag-less re-emits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `is_transient_backend_api_failure` / `is_transient_integrations_failure` match on Sentry `domain` + `failure` + `status` tags written at the authed_json / integrations call sites. Upstream re-emit sites (`rpc.invoke_method` in jsonrpc.rs, `web_channel.run_chat_task` in web.rs) call `report_error` with the stringified downstream error and no `failure` / `status` tag context — so the tag-based classifiers miss them. `is_transient_message_failure(&str)` closes that gap. It matches a tight token list: parenthesised status (`(502 `) or canonical reason phrases (`502 bad gateway`, `429 too many requests`, …) plus the existing `TRANSIENT_TRANSPORT_PHRASES`. Bare status numbers without a non-numeric anchor do NOT match (negative test covers "process 502 exited"). Unit tests cover canonical 408/429/502/503/504/520 phrases, all transport phrases, and 5 negative cases (404, 500, schema errors, bare numbers, empty string). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/observability.rs | 85 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index 924d730675..a7a793943c 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -371,6 +371,42 @@ pub fn is_transient_integrations_failure(event: &sentry::protocol::Event<'_>) -> is_transient_domain_failure(event, "integrations") } +/// String tokens that mark a formatted error message as a transient HTTP +/// failure. Used at upstream emit sites (`rpc.invoke_method`, +/// `web_channel.run_chat_task`) where the error has already been stringified +/// and the original `status` / `failure` tag context is gone. +/// +/// Each token combines a status code with a non-numeric anchor (parenthesis +/// or canonical reason phrase) so bare numeric coincidences ("process 502 +/// exited") do not match. +const TRANSIENT_STATUS_MESSAGE_TOKENS: &[&str] = &[ + "(408 ", + "(429 ", + "(502 ", + "(503 ", + "(504 ", + "(520 ", + "408 request timeout", + "429 too many requests", + "502 bad gateway", + "503 service unavailable", + "504 gateway timeout", + "520 ", +]; + +/// Returns true when a formatted error message describes a transient HTTP +/// or transport-layer failure that has already been demoted further down the +/// stack. Use at upstream re-emit sites (`rpc.invoke_method`, +/// `web_channel.run_chat_task`) where `report_error` is called with the +/// stringified downstream error and no `failure` / `status` tag context. +pub fn is_transient_message_failure(msg: &str) -> bool { + let lower = msg.to_ascii_lowercase(); + TRANSIENT_STATUS_MESSAGE_TOKENS + .iter() + .any(|token| lower.contains(token)) + || contains_transient_transport_phrase(&lower) +} + #[cfg(test)] mod tests { use super::*; @@ -820,6 +856,55 @@ mod tests { ); } + #[test] + fn message_failure_classifier_matches_canonical_status_phrases() { + for msg in [ + "rpc.invoke_method failed: GET /teams failed (502 Bad Gateway)", + "GET /teams/me/usage failed (503 Service Unavailable)", + "downstream returned (504 Gateway Timeout): retry budget exhausted", + "OpenHuman API error (520 ): cf", + "POST /channels/telegram/typing failed (429 Too Many Requests)", + "auth connect failed: 503 Service Unavailable", + ] { + assert!( + is_transient_message_failure(msg), + "{msg:?} must be classified as transient" + ); + } + } + + #[test] + fn message_failure_classifier_matches_transport_phrases() { + for msg in [ + "integrations.get failed: composio/tools → operation timed out", + "GET https://api.example.com → connection forcibly closed (os 10054)", + "POST /v1/foo → tls handshake eof", + "error sending request for url (https://api.example.com)", + ] { + assert!( + is_transient_message_failure(msg), + "{msg:?} must be classified as transient" + ); + } + } + + #[test] + fn message_failure_classifier_keeps_unrelated_messages() { + for msg in [ + "rpc.invoke_method failed: schema validation error", + "process 502 exited unexpectedly", + "GET /teams failed (404 Not Found)", + "GET /teams failed (500 Internal Server Error)", + "unrelated error with port 5023", + "", + ] { + assert!( + !is_transient_message_failure(msg), + "{msg:?} must not be classified as transient" + ); + } + } + #[test] fn report_error_or_expected_does_not_panic() { report_error_or_expected( From a1aaf3d98a2576424d2e4666a6d94067e37775bd Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 17:20:20 +0530 Subject: [PATCH 09/11] fix(rpc): demote transient downstream failures to warn `rpc.invoke_method` re-emits stringified errors from backend_api, integrations, and provider downstream calls. Without this gate every upstream 502/503/504/520/timeout/conn-reset hit Sentry once at the emit site (now demoted by the cluster A / R filters) AND a second time here as a `domain="rpc"` event the tag-based classifiers cannot catch. Closes the residual leaks behind OPENHUMAN-TAURI-8Z, -93, -8W, -96. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/jsonrpc.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/core/jsonrpc.rs b/src/core/jsonrpc.rs index 6f07b49259..aa491bfeee 100644 --- a/src/core/jsonrpc.rs +++ b/src/core/jsonrpc.rs @@ -111,6 +111,18 @@ pub async fn rpc_handler(State(state): State, Json(req): Json err ({}ms): {}", method, ms, display_message); + } else if crate::core::observability::is_transient_message_failure(&display_message) { + // Downstream call (backend_api / integrations / provider) already + // demoted the underlying transient failure to a warn. The error + // string still propagates up to here; re-reporting at error level + // would re-create the very Sentry noise the lower-layer demote + // was meant to avoid (#8Z, #93, #8W, #96). + tracing::warn!( + method = %method, + elapsed_ms = ms as u64, + "[rpc] transient downstream failure — not reporting to Sentry: {}", + display_message + ); } else { crate::core::observability::report_error_or_expected( display_message.as_str(), From 8e28b3dbcfcabe915bb7dd171e41bb6abb6626f4 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 18:22:40 +0530 Subject: [PATCH 10/11] fix(api/rest): walk error source chain for transient classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `request.send().await.map_err` was matching the transient transport phrase against `e.to_string()` only — which on reqwest typically returns just the outermost wrapper ("error sending request for url (...)"). When the transient marker (TLS EOF, connection reset, timeout) lives in a nested cause, the top-level string omits it and the failure misclassifies as non-transient noise on Sentry. Walk `Error::source()` to assemble the full chain before phrase matching, mirroring the existing integrations/client.rs pattern. Per CodeRabbit review on PR #1632. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/api/rest.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/api/rest.rs b/src/api/rest.rs index e5aa6d073d..efab8d2961 100644 --- a/src/api/rest.rs +++ b/src/api/rest.rs @@ -423,7 +423,17 @@ impl BackendOAuthClient { } let response = request.send().await.map_err(|e| { - let error_message = e.to_string(); + // Walk the error source chain so transient markers hidden in nested + // causes (reqwest -> hyper -> rustls TLS EOF, etc.) still classify + // correctly. The top-level `e.to_string()` often only carries the + // outermost wrapper, e.g. "error sending request for url (...)". + let mut error_message = e.to_string(); + let mut src: Option<&(dyn std::error::Error + 'static)> = std::error::Error::source(&e); + while let Some(s) = src { + error_message.push_str(" → "); + error_message.push_str(&s.to_string()); + src = s.source(); + } if crate::core::observability::contains_transient_transport_phrase(&error_message) { tracing::warn!( domain = "backend_api", From ffb7ea1ffec2a7efdc25a193f6135f047513aae9 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Wed, 13 May 2026 21:28:00 +0530 Subject: [PATCH 11/11] fix(jsonrpc): redact upstream message in transient-failure log (#1632 CR) CodeRabbit major: the `is_transient_message_failure` branch logged `display_message` verbatim via tracing::warn!, but the value is upstream-derived (backend / provider) and can carry URL fragments, query params, or pasted-through provider error text including tokens. Per openhuman/CLAUDE.md "never log secrets or full PII". Route through `providers::ops::sanitize_api_error` (same helper used in the SessionExpired publish path) before emitting. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/jsonrpc.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/core/jsonrpc.rs b/src/core/jsonrpc.rs index aa491bfeee..7ee76687b5 100644 --- a/src/core/jsonrpc.rs +++ b/src/core/jsonrpc.rs @@ -117,11 +117,19 @@ pub async fn rpc_handler(State(state): State, Json(req): Json