From c62c782d96db56554ff67d8e8dcadee7019e4121 Mon Sep 17 00:00:00 2001 From: cyq <15000851237@163.com> Date: Mon, 1 Jun 2026 17:45:42 +0800 Subject: [PATCH 1/3] feat(search): allow custom DuckDuckGo endpoint --- config.example.toml | 2 + crates/tui/src/config.rs | 53 +++++++++++++++++++++ crates/tui/src/core/engine.rs | 4 ++ crates/tui/src/main.rs | 3 ++ crates/tui/src/runtime_threads.rs | 1 + crates/tui/src/tools/spec.rs | 5 ++ crates/tui/src/tools/web_search.rs | 76 +++++++++++++++++++++++++----- crates/tui/src/tui/ui.rs | 1 + docs/CONFIGURATION.md | 6 +++ 9 files changed, 139 insertions(+), 12 deletions(-) diff --git a/config.example.toml b/config.example.toml index b4d21c158..82b4465de 100644 --- a/config.example.toml +++ b/config.example.toml @@ -353,6 +353,7 @@ max_subagents = 10 # optional (1-20) # # baidu: 百度 AI Search via qianfan.baidubce.com,需 api_key # # volcengine: 火山引擎 Ark web_search (免费 2 万次/月), 需 api_key # # 也回退到 VOLCENGINE_API_KEY / VOLCENGINE_ARK_API_KEY / ARK_API_KEY 环境变量 +# base_url = "https://search.example/html/" # optional DuckDuckGo-compatible HTML endpoint # api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, and baidu; optional for metaso # # WARNING: treat config.toml like a secret file when # # storing API keys. Prefer env vars for local smoke tests. @@ -360,6 +361,7 @@ max_subagents = 10 # optional (1-20) # Env-var overrides: # DEEPSEEK_SEARCH_PROVIDER → search.provider # DEEPSEEK_SEARCH_API_KEY → search.api_key +# DEEPSEEK_SEARCH_BASE_URL → search.base_url # METASO_API_KEY → metaso key fallback # BAIDU_SEARCH_API_KEY → baidu key fallback diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 10dd8493b..a01299d43 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -931,6 +931,11 @@ pub struct SearchConfig { /// Search provider: `bing` | `duckduckgo` | `tavily` | `bocha` | `metaso` | `baidu` | `volcengine`. Default: `duckduckgo`. #[serde(default)] pub provider: Option, + /// Optional DuckDuckGo-compatible HTML endpoint. When set with the + /// DuckDuckGo provider, `web_search` appends the `q` query parameter to + /// this URL instead of using `https://html.duckduckgo.com/html/`. + #[serde(default)] + pub base_url: Option, /// API key for Tavily, Bocha, Metaso, Baidu, or Volcengine. Not required for Bing or DuckDuckGo. /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in default. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY` env var. @@ -3340,6 +3345,14 @@ fn apply_env_overrides(config: &mut Config) { .get_or_insert_with(SearchConfig::default) .api_key = Some(value); } + if let Ok(value) = std::env::var("DEEPSEEK_SEARCH_BASE_URL") + && !value.trim().is_empty() + { + config + .search + .get_or_insert_with(SearchConfig::default) + .base_url = Some(value); + } if let Ok(value) = std::env::var("DEEPSEEK_REQUIREMENTS_PATH") { config.requirements_path = Some(value); } @@ -4868,6 +4881,25 @@ mod tests { ); } + #[test] + fn search_config_preserves_custom_base_url() { + let config: Config = toml::from_str( + r#" + [search] + provider = "duckduckgo" + base_url = "https://search.internal.example/html/" + "#, + ) + .expect("search config"); + + let search = config.search.expect("search table"); + assert_eq!(search.provider, Some(SearchProvider::DuckDuckGo)); + assert_eq!( + search.base_url.as_deref(), + Some("https://search.internal.example/html/") + ); + } + #[test] fn explicit_baidu_search_provider_is_preserved() { let config: Config = toml::from_str( @@ -5011,6 +5043,27 @@ mod tests { ); } + #[test] + fn apply_env_overrides_sets_search_base_url() { + let _guard = lock_test_env(); + let prev = env::var_os("DEEPSEEK_SEARCH_BASE_URL"); + unsafe { + env::set_var( + "DEEPSEEK_SEARCH_BASE_URL", + "https://search.internal.example/html/", + ) + }; + let mut config = Config::default(); + + apply_env_overrides(&mut config); + + unsafe { EnvGuard::restore_var("DEEPSEEK_SEARCH_BASE_URL", prev) }; + assert_eq!( + config.search.and_then(|search| search.base_url), + Some("https://search.internal.example/html/".to_string()) + ); + } + #[test] fn search_provider_resolution_ignores_invalid_env_override() { let _guard = lock_test_env(); diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 5813b5381..1411046a6 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -182,6 +182,8 @@ pub struct EngineConfig { /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in key. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY`. pub search_api_key: Option, + /// Optional DuckDuckGo-compatible HTML endpoint override. + pub search_base_url: Option, /// Per-step DeepSeek API timeout for sub-agent `create_message` requests. /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) /// once at engine construction, then threaded onto every @@ -241,6 +243,7 @@ impl Default for EngineConfig { workshop: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, subagent_api_timeout: Duration::from_secs( crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS, ), @@ -1711,6 +1714,7 @@ impl Engine { // Wire search provider config. ctx.search_provider = self.config.search_provider; ctx.search_api_key = self.config.search_api_key.clone(); + ctx.search_base_url = self.config.search_base_url.clone(); let policy = sandbox_policy_for_mode(mode, &self.session.workspace); let mut ctx = ctx.with_elevated_sandbox_policy(policy); diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index 9feaaac46..a874aa563 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -5385,6 +5385,7 @@ async fn run_exec_agent( workshop: config.workshop.clone(), search_provider: config.search_provider(), search_api_key: config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: config.tools_always_load(), tools: config.tools.clone(), }; @@ -5956,6 +5957,7 @@ mod doctor_endpoint_tests { let config = Config { search: Some(crate::config::SearchConfig { provider: Some(crate::config::SearchProvider::DuckDuckGo), + base_url: None, api_key: None, }), ..Default::default() @@ -5995,6 +5997,7 @@ mod doctor_endpoint_tests { let config = Config { search: Some(crate::config::SearchConfig { provider: Some(crate::config::SearchProvider::Bing), + base_url: None, api_key: None, }), ..Default::default() diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 51f79922c..1d5fe0520 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -2026,6 +2026,7 @@ impl RuntimeThreadManager { workshop: self.config.workshop.clone(), search_provider: self.config.search_provider(), search_api_key: self.config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: self.config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: self.config.tools_always_load(), tools: self.config.tools.clone(), }; diff --git a/crates/tui/src/tools/spec.rs b/crates/tui/src/tools/spec.rs index 6a66c37fa..e19f3777a 100644 --- a/crates/tui/src/tools/spec.rs +++ b/crates/tui/src/tools/spec.rs @@ -169,6 +169,8 @@ pub struct ToolContext { /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in key. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY`. pub search_api_key: Option, + /// Optional DuckDuckGo-compatible HTML endpoint override for `web_search`. + pub search_base_url: Option, /// Per-session workshop variable store (#548). Holds the raw content of /// the most recent large-tool routing event so the parent can call @@ -210,6 +212,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } @@ -247,6 +250,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } @@ -284,6 +288,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index 3e36ae5d4..b9b8415ed 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -7,6 +7,7 @@ //! //! Set `[search]` in config.toml to switch providers: //! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine +//! base_url = "https://search.example/html/" # optional DDG-compatible URL //! api_key = "tvly-..." use super::spec::{ @@ -22,7 +23,7 @@ use serde_json::{Value, json}; use std::sync::OnceLock; use std::time::Duration; -const DUCKDUCKGO_HOST: &str = "html.duckduckgo.com"; +const DUCKDUCKGO_ENDPOINT: &str = "https://html.duckduckgo.com/html/"; const BING_HOST: &str = "www.bing.com"; const TAVILY_ENDPOINT: &str = "https://api.tavily.com/search"; const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search"; @@ -139,7 +140,7 @@ impl ToolSpec for WebSearchTool { } fn description(&self) -> &'static str { - "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." + "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." } fn input_schema(&self) -> Value { @@ -261,13 +262,16 @@ impl ToolSpec for WebSearchTool { } // Per-domain network policy gate (#135). The "host" for web search is - // the upstream search engine domain — DuckDuckGo first, Bing on - // fallback. We gate DuckDuckGo here; Bing is gated separately inside - // the fallback path so a deny on one engine doesn't block the other. - check_policy(decider, DUCKDUCKGO_HOST)?; + // the upstream search engine domain — DuckDuckGo-compatible first, + // Bing on fallback. We gate the configured endpoint here; Bing is + // gated separately inside the fallback path so a deny on one engine + // doesn't silently allow the other. + let (url, duckduckgo_host) = + duckduckgo_search_url(context.search_base_url.as_deref(), &query)?; + let allow_bing_fallback = + duckduckgo_allows_bing_fallback(context.search_base_url.as_deref()); + check_policy(decider, &duckduckgo_host)?; - let encoded = url_encode(&query); - let url = format!("https://html.duckduckgo.com/html/?q={encoded}"); let resp = client .get(&url) .header( @@ -302,7 +306,7 @@ impl ToolSpec for WebSearchTool { message_suffix = Some("Bing returned no results; used DuckDuckGo fallback"); } - if results.is_empty() { + if results.is_empty() && allow_bing_fallback { let duckduckgo_blocked = is_duckduckgo_challenge(&body); // Bing is a separate host — gate it independently so a deny on // DuckDuckGo doesn't silently let Bing through (and vice versa). @@ -1332,6 +1336,30 @@ fn normalize_bing_url(href: &str) -> String { href.to_string() } +fn duckduckgo_search_url( + base_url: Option<&str>, + query: &str, +) -> Result<(String, String), ToolError> { + let raw = base_url + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or(DUCKDUCKGO_ENDPOINT); + let mut url = reqwest::Url::parse(raw).map_err(|err| { + ToolError::invalid_input(format!( + "Invalid DuckDuckGo-compatible search base_url: {err}" + )) + })?; + url.query_pairs_mut().append_pair("q", query); + let host = url.host_str().ok_or_else(|| { + ToolError::invalid_input("DuckDuckGo-compatible search base_url must include a host") + })?; + Ok((url.to_string(), host.to_string())) +} + +fn duckduckgo_allows_bing_fallback(base_url: Option<&str>) -> bool { + base_url.is_none_or(|value| value.trim().is_empty()) +} + fn normalize_text(text: &str) -> String { let stripped = strip_html_tags(text); let decoded = decode_html_entities(&stripped); @@ -1435,9 +1463,9 @@ fn extract_query_param(url: &str, key: &str) -> Option { mod tests { use super::{ ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, baidu_search_payload, - decode_html_entities, extract_search_query, is_likely_spam_results, normalize_bing_url, - optional_search_max_results, parse_baidu_results, root_domain, sanitize_error_body, - truncate_error_body, volcengine_extract_text, + decode_html_entities, duckduckgo_search_url, extract_search_query, is_likely_spam_results, + normalize_bing_url, optional_search_max_results, parse_baidu_results, root_domain, + sanitize_error_body, truncate_error_body, volcengine_extract_text, }; use serde_json::json; @@ -1969,4 +1997,28 @@ mod tests { "should not complain about missing API key (built-in default); got `{msg}`" ); } + + #[test] + fn duckduckgo_compatible_url_uses_custom_base_url_and_preserves_query() { + let (url, host) = duckduckgo_search_url( + Some("https://search.internal.example/html/?region=us"), + "rust async", + ) + .expect("custom duckduckgo-compatible url"); + + assert_eq!(host, "search.internal.example"); + assert_eq!( + url, + "https://search.internal.example/html/?region=us&q=rust+async" + ); + } + + #[test] + fn custom_duckduckgo_endpoint_disables_public_bing_fallback() { + assert!(super::duckduckgo_allows_bing_fallback(None)); + assert!(super::duckduckgo_allows_bing_fallback(Some(" "))); + assert!(!super::duckduckgo_allows_bing_fallback(Some( + "https://search.internal.example/html/" + ))); + } } diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index e92a2a056..41f812457 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -779,6 +779,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig { workshop: config.workshop.clone(), search_provider: config.search_provider(), search_api_key: config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: config.tools_always_load(), tools: config.tools.clone(), } diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index de9a03a6a..c84cfe6a5 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -819,6 +819,11 @@ parseable results. Bing remains selectable for users who explicitly want it, and Tavily, Bocha, Metaso, or Baidu can be selected when an API-backed provider is preferred. +For a private/internal search service that serves DuckDuckGo-compatible HTML, +keep `provider = "duckduckgo"` and set `base_url`; CodeWhale appends the `q` +query parameter to that endpoint and applies network policy to its host. +Custom endpoints do not fall back to public Bing. + **Metaso** ([metaso.cn](https://metaso.cn)) has a 100 searches/day free quota; set `METASO_API_KEY` or `[search] api_key` for a higher quota. @@ -830,6 +835,7 @@ only; it does not add a Baidu model provider. ```toml [search] provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu +# base_url = "https://search.example/html/" # optional with provider = "duckduckgo" # api_key = "YOUR_KEY" # required for tavily, bocha, and baidu; optional for metaso ``` From ea7d72cea39a4faa8478271f204870d7df415e20 Mon Sep 17 00:00:00 2001 From: cyq <15000851237@163.com> Date: Mon, 1 Jun 2026 23:02:26 +0800 Subject: [PATCH 2/3] fix(search): avoid newer endpoint helper syntax --- crates/tui/src/config.rs | 15 ++++++++------- crates/tui/src/tools/web_search.rs | 5 ++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index a01299d43..ccd37f28d 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -3345,13 +3345,14 @@ fn apply_env_overrides(config: &mut Config) { .get_or_insert_with(SearchConfig::default) .api_key = Some(value); } - if let Ok(value) = std::env::var("DEEPSEEK_SEARCH_BASE_URL") - && !value.trim().is_empty() - { - config - .search - .get_or_insert_with(SearchConfig::default) - .base_url = Some(value); + match std::env::var("DEEPSEEK_SEARCH_BASE_URL") { + Ok(value) if !value.trim().is_empty() => { + config + .search + .get_or_insert_with(SearchConfig::default) + .base_url = Some(value); + } + _ => {} } if let Ok(value) = std::env::var("DEEPSEEK_REQUIREMENTS_PATH") { config.requirements_path = Some(value); diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index b9b8415ed..7f5913e36 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -1357,7 +1357,10 @@ fn duckduckgo_search_url( } fn duckduckgo_allows_bing_fallback(base_url: Option<&str>) -> bool { - base_url.is_none_or(|value| value.trim().is_empty()) + match base_url { + Some(value) => value.trim().is_empty(), + None => true, + } } fn normalize_text(text: &str) -> String { From 8b6b3e61d03b892c82e52b666001f0e7870aef18 Mon Sep 17 00:00:00 2001 From: cyq <15000851237@163.com> Date: Mon, 1 Jun 2026 23:07:49 +0800 Subject: [PATCH 3/3] fix(search): surface custom endpoint config errors --- crates/tui/src/tools/web_search.rs | 89 +++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 9 deletions(-) diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index 7f5913e36..17b57162b 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -197,6 +197,15 @@ impl ToolSpec for WebSearchTool { let max_results = max_results.clamp(1, MAX_RESULTS); let timeout_ms = optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(60_000); + if configured_search_base_url(context.search_base_url.as_deref()).is_some() + && !matches!(context.search_provider, SearchProvider::DuckDuckGo) + { + return Err(ToolError::invalid_input(format!( + "[search].base_url is only supported with provider = \"duckduckgo\"; current provider is \"{}\"", + context.search_provider.as_str() + ))); + } + // Dispatch to the configured API-backed search providers before // building the HTML-scraping client used by Bing/DuckDuckGo. match context.search_provider { @@ -306,8 +315,14 @@ impl ToolSpec for WebSearchTool { message_suffix = Some("Bing returned no results; used DuckDuckGo fallback"); } + let duckduckgo_blocked = is_duckduckgo_challenge(&body); + if results.is_empty() && duckduckgo_blocked && !allow_bing_fallback { + return Err(ToolError::execution_failed(format!( + "DuckDuckGo-compatible search endpoint at {duckduckgo_host} returned a bot challenge; check the private search service, credentials, or network policy" + ))); + } + if results.is_empty() && allow_bing_fallback { - let duckduckgo_blocked = is_duckduckgo_challenge(&body); // Bing is a separate host — gate it independently so a deny on // DuckDuckGo doesn't silently let Bing through (and vice versa). check_policy(decider, BING_HOST)?; @@ -1340,10 +1355,7 @@ fn duckduckgo_search_url( base_url: Option<&str>, query: &str, ) -> Result<(String, String), ToolError> { - let raw = base_url - .map(str::trim) - .filter(|value| !value.is_empty()) - .unwrap_or(DUCKDUCKGO_ENDPOINT); + let raw = configured_search_base_url(base_url).unwrap_or(DUCKDUCKGO_ENDPOINT); let mut url = reqwest::Url::parse(raw).map_err(|err| { ToolError::invalid_input(format!( "Invalid DuckDuckGo-compatible search base_url: {err}" @@ -1356,11 +1368,12 @@ fn duckduckgo_search_url( Ok((url.to_string(), host.to_string())) } +fn configured_search_base_url(base_url: Option<&str>) -> Option<&str> { + base_url.map(str::trim).filter(|value| !value.is_empty()) +} + fn duckduckgo_allows_bing_fallback(base_url: Option<&str>) -> bool { - match base_url { - Some(value) => value.trim().is_empty(), - None => true, - } + configured_search_base_url(base_url).is_none() } fn normalize_text(text: &str) -> String { @@ -2024,4 +2037,62 @@ mod tests { "https://search.internal.example/html/" ))); } + + #[tokio::test] + async fn custom_duckduckgo_challenge_returns_actionable_error() { + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + use wiremock::matchers::{method, path, query_param}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/html/")) + .and(query_param("q", "rust async")) + .respond_with(ResponseTemplate::new(200).set_body_string( + r#"
Unfortunately, bots use DuckDuckGo too
"#, + )) + .mount(&server) + .await; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::DuckDuckGo; + ctx.search_base_url = Some(format!("{}/html/", server.uri())); + + let err = WebSearchTool + .execute(json!({"query": "rust async"}), &ctx) + .await + .expect_err("custom endpoint challenge should error"); + let msg = err.to_string(); + assert!( + msg.contains("DuckDuckGo-compatible search endpoint") + && msg.contains("bot challenge") + && msg.contains("private search service"), + "got `{msg}`" + ); + } + + #[tokio::test] + async fn search_base_url_with_non_duckduckgo_provider_is_explicit_error() { + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::Tavily; + ctx.search_base_url = Some("https://search.internal.example/html/".to_string()); + + let err = WebSearchTool + .execute(json!({"query": "rust async"}), &ctx) + .await + .expect_err("non-duckduckgo provider with base_url should error"); + let msg = err.to_string(); + assert!( + msg.contains("[search].base_url") + && msg.contains("provider = \"duckduckgo\"") + && msg.contains("tavily"), + "got `{msg}`" + ); + } }