diff --git a/.env.example b/.env.example
index b4e08e1a..33a4ab0d 100644
--- a/.env.example
+++ b/.env.example
@@ -7,3 +7,6 @@ PROVIDER_API_KEY=
 # OpenAI-compatible providers can use any env var name as long as config.yaml
 # points model.api_key_env at the same name.
 OPENAI_API_KEY=
+
+# Required when tools.search_provider is set to 'tavily' in config.yaml.
+TAVILY_API_KEY=
diff --git a/config.example.yaml b/config.example.yaml
index fd662e4d..ac85bf1e 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -16,6 +16,7 @@ tools:
   web: true
   shell: true
   files: true
+  # search_provider: duckduckgo  # 'duckduckgo' (default) or 'tavily'
 email:
   enabled: false
   smtp_host: ""
diff --git a/nipux_cli/config.py b/nipux_cli/config.py
index 7d1c2a3d..e8b5eedf 100644
--- a/nipux_cli/config.py
+++ b/nipux_cli/config.py
@@ -129,6 +129,7 @@ class ToolAccessConfig:
     web: bool = True
     shell: bool = True
     files: bool = True
+    search_provider: str = "duckduckgo"
 
 
 @dataclass(frozen=True)
@@ -214,6 +215,7 @@ def load_config(path: str | Path | None = None) -> AppConfig:
         web=bool(tools_raw.get("web", True)),
         shell=bool(tools_raw.get("shell", True)),
         files=bool(tools_raw.get("files", True)),
+        search_provider=str(tools_raw.get("search_provider") or "duckduckgo").strip().lower(),
     )
     email = EmailConfig(
         enabled=bool(email_raw.get("enabled", False)),
@@ -256,6 +258,7 @@ def default_config_yaml(
         "  web: true\n"
         "  shell: true\n"
         "  files: true\n"
+        "  # search_provider: duckduckgo  # 'duckduckgo' (default) or 'tavily'\n"
         "email:\n"
         "  enabled: false\n"
         "  smtp_host: \"\"\n"
diff --git a/nipux_cli/tools.py b/nipux_cli/tools.py
index cc88ebd0..30b4b4cf 100644
--- a/nipux_cli/tools.py
+++ b/nipux_cli/tools.py
@@ -853,14 +853,14 @@ def _browser_call(name: str, args: dict[str, Any], ctx: ToolContext) -> str:
 
 
 def _web_call(name: str, args: dict[str, Any], ctx: ToolContext) -> str:
-    del ctx
     from nipux_cli.web import web_extract, web_search
 
+    search_provider = ctx.config.tools.search_provider
     if name == "web_search":
-        return _json(web_search(str(args.get("query") or ""), limit=int(args.get("limit") or 5)))
+        return _json(web_search(str(args.get("query") or ""), limit=int(args.get("limit") or 5), search_provider=search_provider))
     if name == "web_extract":
         urls = args.get("urls") if isinstance(args.get("urls"), list) else []
-        return _json(web_extract(urls[:5]))
+        return _json(web_extract(urls[:5], search_provider=search_provider))
     raise KeyError(name)
 
 
diff --git a/nipux_cli/web.py b/nipux_cli/web.py
index 222e6677..3b34cff3 100644
--- a/nipux_cli/web.py
+++ b/nipux_cli/web.py
@@ -79,7 +79,7 @@ def _duckduckgo_link(raw: str) -> str:
     return html.unescape(raw)
 
 
-def web_search(query: str, *, limit: int = 5) -> dict[str, Any]:
+def _ddg_search(query: str, *, limit: int = 5) -> dict[str, Any]:
     url = "https://duckduckgo.com/html/?" + urllib.parse.urlencode({"q": query})
     markup, _ = _request(url)
     pattern = re.compile(
@@ -95,7 +95,7 @@ def web_search(query: str, *, limit: int = 5) -> dict[str, Any]:
     return {"success": True, "query": query, "results": results}
 
 
-def web_extract(urls: list[str], *, limit_chars: int = 12_000) -> dict[str, Any]:
+def _ddg_extract(urls: list[str], *, limit_chars: int = 12_000) -> dict[str, Any]:
     pages = []
     for url in urls[:5]:
         try:
@@ -119,3 +119,76 @@ def web_extract(urls: list[str], *, limit_chars: int = 12_000) -> dict[str, Any]
         except Exception as exc:
             pages.append({"url": url, "error": str(exc)})
     return {"success": True, "pages": pages}
+
+
+def _tavily_search(query: str, *, limit: int = 5) -> dict[str, Any]:
+    """Search via Tavily; the result has the same top-level keys as ``_ddg_search``.
+
+    ``TavilyClient()`` reads its key from the ``TAVILY_API_KEY`` environment
+    variable. ``limit`` is clamped to 0..20, the range Tavily documents for
+    ``max_results``.
+    """
+    from tavily import TavilyClient
+
+    client = TavilyClient()
+    response = client.search(query=query, max_results=max(0, min(limit, 20)))
+    results = [
+        {"title": r.get("title", ""), "url": r.get("url", "")}
+        for r in response.get("results", [])
+    ]
+    return {"success": True, "query": query, "results": results}
+
+
+def _tavily_extract(urls: list[str], *, limit_chars: int = 12_000) -> dict[str, Any]:
+    """Fetch page text for up to five URLs through Tavily's extract endpoint.
+
+    Failures are reported as per-URL ``error`` entries instead of being raised,
+    mirroring how ``_ddg_extract`` reports them.
+    """
+    batch = urls[:5]
+    if not batch:
+        # Nothing to fetch: skip the client (and its API-key check) entirely.
+        return {"success": True, "pages": []}
+    try:
+        from tavily import TavilyClient
+
+        response = TavilyClient().extract(urls=batch)
+    except Exception as exc:
+        # Missing package/key or a transport error fails each URL, not the tool call.
+        return {"success": True, "pages": [{"url": url, "error": str(exc)} for url in batch]}
+    pages = []
+    for r in response.get("results", []):
+        text = r.get("raw_content") or r.get("text") or ""
+        pages.append({
+            "url": r.get("url", ""),
+            "text": text[:limit_chars],
+            "truncated": len(text) > limit_chars,
+        })
+    for f in response.get("failed_results", []):
+        pages.append({"url": f.get("url", ""), "error": f.get("error", "extraction failed")})
+    return {"success": True, "pages": pages}
+
+
+def _is_tavily(search_provider: str) -> bool:
+    """Return True when *search_provider* names Tavily, ignoring case and padding."""
+    return str(search_provider or "").strip().lower() == "tavily"
+
+
+def web_search(query: str, *, limit: int = 5, search_provider: str = "duckduckgo") -> dict[str, Any]:
+    """Run a web search with the selected provider.
+
+    Any value other than ``"tavily"`` falls back to ``_ddg_search``.
+    """
+    if _is_tavily(search_provider):
+        return _tavily_search(query, limit=limit)
+    return _ddg_search(query, limit=limit)
+
+
+def web_extract(urls: list[str], *, limit_chars: int = 12_000, search_provider: str = "duckduckgo") -> dict[str, Any]:
+    """Extract page text with the selected provider.
+
+    Any value other than ``"tavily"`` falls back to ``_ddg_extract``.
+    """
+    if _is_tavily(search_provider):
+        return _tavily_extract(urls, limit_chars=limit_chars)
+    return _ddg_extract(urls, limit_chars=limit_chars)
diff --git a/pyproject.toml b/pyproject.toml
index 69e35e81..436c96ef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ classifiers = [
 dependencies = [
     "openai>=2.21.0,<3",
     "pyyaml>=6.0.2,<7",
+    "tavily-python>=0.5.0,<1",
 ]
 
 [project.urls]