Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ ZAI_PLAN_API_KEY=
TAVILY_API_KEY=
JINA_API_KEY=
FIRECRAWL_API_KEY=
# fastCRW (Firecrawl-compatible web scraper). CRW_API_URL defaults to the
# managed cloud; override it to point at a self-hosted server.
CRW_API_KEY=
CRW_API_URL=

# Tracing backend (optional). Only one runs at a time: Langfuse takes
# precedence when its keys are set, otherwise LangSmith is used. Values are
Expand Down
4 changes: 4 additions & 0 deletions intentkit/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ def __init__(self) -> None:
self.jina_api_key: str | None = self.load("JINA_API_KEY")
self.cookiefun_api_key: str | None = self.load("COOKIEFUN_API_KEY")
self.firecrawl_api_key: str | None = self.load("FIRECRAWL_API_KEY")
# fastCRW (Firecrawl-compatible web scraper; single binary, self-host or
# cloud). CRW_API_URL overrides the base for self-hosted deployments.
self.crw_api_key: str | None = self.load("CRW_API_KEY")
self.crw_api_url: str = self.load("CRW_API_URL", "https://fastcrw.com/api")
self.cryptopanic_api_key: str | None = self.load("CRYPTOPANIC_API_KEY")
self.unrealspeech_api_key: str | None = self.load("UNREALSPEECH_API_KEY")
self.dune_api_key: str | None = self.load("DUNE_API_KEY")
Expand Down
79 changes: 79 additions & 0 deletions intentkit/tools/crw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# fastCRW Tools

The fastCRW tools provide web scraping and content indexing using [fastCRW](https://fastcrw.com),
a Firecrawl-compatible web data engine that ships as a single binary. Run it
self-hosted (free, open core) or against the managed cloud. These tools mirror
the Firecrawl provider and are additive — both providers can be enabled side by
side.

## Tools Overview

### 1. crw_scrape
Scrapes a single webpage and REPLACES any existing indexed content for that URL, preventing duplicates.

**Parameters:**
- `url` (required): The URL to scrape
- `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links, json (default: ["markdown"])
- `only_main_content` (optional): Extract only main content (default: true)
- `include_tags` (optional): HTML tags to include (e.g., ["h1", "h2", "p"])
- `exclude_tags` (optional): HTML tags to exclude
- `wait_for` (optional): Wait time in milliseconds before scraping
- `timeout` (optional): Maximum timeout in milliseconds (default: 30000)
- `index_content` (optional): Whether to index content for querying (default: true)
- `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
- `chunk_overlap` (optional): Overlap between chunks (default: 200)

### 2. crw_crawl
Crawls multiple pages from a website and indexes all content.

**Parameters:**
- `url` (required): The base URL to start crawling
- `limit` (optional): Maximum number of pages to crawl (default: 10)
- `include_paths` (optional): URL patterns to include (e.g., ["/docs/*"])
- `exclude_paths` (optional): URL patterns to exclude
- `max_depth` (optional): Maximum crawl depth
- `index_content` (optional): Whether to index content for querying (default: true)
- `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
- `chunk_overlap` (optional): Overlap between chunks (default: 200)

### 3. crw_query_indexed_content
Queries previously indexed fastCRW content using semantic search.

**Parameters:**
- `query` (required): The search query
- `max_results` (optional): Maximum number of results to return (1-10, default: 4)

### 4. crw_clear_indexed_content
Clears all previously indexed fastCRW content from the vector store.

**Parameters:**
- `confirm` (optional, default: false): Must be set to true to confirm the deletion; the tool refuses to run otherwise

**Note:** This action is permanent and cannot be undone.

## Configuration

fastCRW is Firecrawl-compatible; the integration defaults to the managed cloud
and lets you override the base URL for a self-hosted server.

```bash
# Managed cloud (default base URL https://fastcrw.com/api)
export CRW_API_KEY=your-api-key-here

# Self-hosted server (auth optional; CRW_API_KEY may be omitted)
export CRW_API_URL=http://localhost:3000
```

- `CRW_API_KEY` — Bearer token. Optional for self-hosted instances that run without auth.
- `CRW_API_URL` — Base URL, default `https://fastcrw.com/api`. Set this to point at a self-hosted server.

Content indexing uses OpenAI embeddings, so `OPENAI_API_KEY` must also be configured.

## Features and Benefits

- **Firecrawl-compatible**: Same REST surface and data shapes as Firecrawl.
- **Single binary**: Self-host the open core for free, or use the managed cloud.
- **JavaScript Rendering**: Handles SPAs and dynamic content.
- **Intelligent Chunking**: Optimized text splitting for better search.
- **Content Replacement**: Replace mode prevents duplicate/stale content on re-scrape.
- **Semantic Search**: Uses OpenAI embeddings for intelligent querying.
102 changes: 102 additions & 0 deletions intentkit/tools/crw/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""fastCRW tools for web scraping and crawling.

fastCRW is a Firecrawl-compatible web scraper that ships as a single binary and
runs self-hosted or on the managed cloud at https://fastcrw.com. This toolset
mirrors the Firecrawl provider with a different base URL (CRW_API_URL) and key
(CRW_API_KEY); it is additive and does not affect the Firecrawl toolset.
"""

import logging
from typing import NotRequired, TypedDict

from intentkit.config.config import config as system_config
from intentkit.tools.base import ToolsetConfig, ToolState
from intentkit.tools.crw.base import CrwBaseTool
from intentkit.tools.crw.clear import CrwClearIndexedContent
from intentkit.tools.crw.crawl import CrwCrawl
from intentkit.tools.crw.query import CrwQueryIndexedContent
from intentkit.tools.crw.scrape import CrwScrape

# Cache tools at the system level, because they are stateless
_cache: dict[str, CrwBaseTool] = {}

logger = logging.getLogger(__name__)


class ToolStates(TypedDict):
    """Per-tool state for each fastCRW tool, keyed by tool name.

    ``get_tools`` compares each value against "disabled", "public" and
    "private" to decide whether the tool is exposed.
    """

    # Single-page scrape tool.
    crw_scrape: ToolState
    # Multi-page crawl tool.
    crw_crawl: ToolState
    # Semantic query over previously indexed content.
    crw_query_indexed_content: ToolState
    # Destructive clear of the indexed content.
    crw_clear_indexed_content: ToolState


class Config(ToolsetConfig):
    """Toolset configuration accepted by the fastCRW tools."""

    # Mapping of tool name -> state, consumed by ``get_tools``.
    states: ToolStates
    # Optional rate-limit settings; presumably "N calls per M minutes" —
    # TODO confirm against the code that consumes them.
    rate_limit_number: NotRequired[int]
    rate_limit_minutes: NotRequired[int]


async def get_tools(
    config: "Config",
    is_private: bool,
    **_,
) -> list[CrwBaseTool]:
    """Return the fastCRW tools enabled by ``config``.

    A tool is selected when its state is "public", or when its state is
    "private" and the caller is private. Any other state (including
    "disabled") leaves the tool out.

    Args:
        config: The configuration for fastCRW tools; only ``config["states"]``
            is read here.
        is_private: Whether to include private tools.

    Returns:
        A list of fastCRW tools, in the order the states are listed. Names
        that ``get_crw_tool`` cannot resolve are dropped.
    """

    def _is_enabled(state) -> bool:
        # "public" is always exposed; "private" only to private callers.
        if state == "public":
            return True
        return bool(state == "private" and is_private)

    selected_names = [
        tool_name
        for tool_name, state in config["states"].items()
        if _is_enabled(state)
    ]

    # Resolve each name through the cached getter, skipping unknown ones.
    tools = []
    for tool_name in selected_names:
        tool = get_crw_tool(tool_name)
        if tool:
            tools.append(tool)
    return tools


def get_crw_tool(
    name: str,
) -> CrwBaseTool | None:
    """Look up a fastCRW tool by name, creating and caching it on first use.

    Args:
        name: The tool name, e.g. "crw_scrape".

    Returns:
        The cached tool instance, or None (after logging a warning) when the
        name does not belong to this toolset.
    """
    # Tool name -> class used to build the instance on a cache miss.
    tool_classes = {
        "crw_scrape": CrwScrape,
        "crw_crawl": CrwCrawl,
        "crw_query_indexed_content": CrwQueryIndexedContent,
        "crw_clear_indexed_content": CrwClearIndexedContent,
    }

    tool_class = tool_classes.get(name)
    if tool_class is None:
        logger.warning("Unknown fastCRW tool: %s", name)
        return None

    # Instances are shared through the module-level cache.
    if name not in _cache:
        _cache[name] = tool_class()
    return _cache[name]


def available() -> bool:
    """Check if this toolset is available based on system config.

    fastCRW self-host may run without auth, and CRW_API_URL always has a
    default, so the toolset is available when a key is configured or when the
    base URL points somewhere other than the managed cloud.

    The URL is compared with trailing slashes removed, matching how
    ``CrwBaseTool.get_api_url`` normalizes it; otherwise
    "https://fastcrw.com/api/" would be mistaken for a custom server.

    Returns:
        True when a key or a custom base URL is configured, False otherwise.
    """
    if system_config.crw_api_key:
        return True

    # An unset or empty URL means "use the default", which needs a key.
    configured_url = (system_config.crw_api_url or "").rstrip("/")
    return bool(configured_url) and configured_url != "https://fastcrw.com/api"
24 changes: 24 additions & 0 deletions intentkit/tools/crw/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from langchain_core.tools.base import ToolException

from intentkit.config.config import config
from intentkit.tools.base import IntentKitTool


class CrwBaseTool(IntentKitTool):
    """Base class for fastCRW tools.

    fastCRW is a Firecrawl-compatible web scraper that ships as a single binary
    and runs self-hosted or on the managed cloud. The REST surface mirrors
    Firecrawl, so these tools mirror the Firecrawl provider with a different
    base URL (CRW_API_URL, default https://fastcrw.com/api) and key (CRW_API_KEY).
    """

    category: str = "crw"

    def get_api_key(self) -> str | None:
        """Return the configured fastCRW API key, or None when unset.

        Self-hosted fastCRW may run without auth, so the key is optional there.
        """
        return config.crw_api_key

    def get_api_url(self) -> str:
        """Return the fastCRW base URL without a trailing slash.

        Falls back to the managed-cloud URL when no URL is configured.
        """
        configured_url = config.crw_api_url or "https://fastcrw.com/api"
        return configured_url.rstrip("/")
79 changes: 79 additions & 0 deletions intentkit/tools/crw/clear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging

from langchain_core.tools import ArgsSchema
from langchain_core.tools.base import ToolException
from pydantic import BaseModel, Field

from intentkit.models.tool import AgentToolData
from intentkit.tools.crw.base import CrwBaseTool

logger = logging.getLogger(__name__)


class CrwClearInput(BaseModel):
    """Arguments accepted by the fastCRW clear tool."""

    # Defaults to False, so the caller has to opt in to the deletion.
    confirm: bool = Field(
        default=False,
        description="Must be true to confirm deletion.",
    )


class CrwClearIndexedContent(CrwBaseTool):
    """Tool for clearing all indexed fastCRW content.

    This tool removes all previously indexed content from the fastCRW vector store,
    allowing for a fresh start with new content.

    NOTE(review): the data is stored under the "web_scraper" tool namespace
    with per-agent keys, so this presumably also removes content indexed by any
    other tool that shares those keys — confirm.
    """

    name: str = "crw_clear_indexed_content"
    description: str = "Permanently clear all indexed fastCRW content from the vector store. Cannot be undone."
    args_schema: ArgsSchema | None = CrwClearInput

    async def _arun(
        self,
        confirm: bool = False,
        **kwargs,
    ) -> str:
        """Clear all indexed fastCRW content for the agent.

        Args:
            confirm: Must be True to confirm the deletion.
            **kwargs: Accepted and ignored.

        Returns:
            str: Confirmation message.

        Raises:
            ToolException: If the agent ID is unavailable, if ``confirm`` is
                not true, or if deleting the stored data fails.
        """
        context = self.get_context()
        agent_id = context.agent_id

        if not agent_id:
            raise ToolException("Error: Agent ID not available for clearing content.")
        if not confirm:
            raise ToolException(
                "Error: You must set confirm=true to clear all indexed content."
            )
        logger.info(
            "crw_clear: Starting clear indexed content operation for agent %s",
            agent_id,
        )

        try:
            # Delete vector store data (using web_scraper storage format for compatibility)
            vector_store_key = f"vector_store_{agent_id}"
            await AgentToolData.delete(agent_id, "web_scraper", vector_store_key)

            # Delete metadata
            metadata_key = f"indexed_urls_{agent_id}"
            await AgentToolData.delete(agent_id, "web_scraper", metadata_key)
        except Exception as e:
            # Log with traceback, then surface a ToolException that keeps the
            # original error attached as its cause.
            logger.exception(
                "crw_clear: Error clearing indexed content for agent %s: %s",
                agent_id,
                e,
            )
            raise ToolException(f"Error clearing indexed content: {str(e)}") from e

        logger.info(
            "crw_clear: Successfully cleared all indexed content for agent %s",
            agent_id,
        )
        return "Successfully cleared all fastCRW indexed content. The vector store is now empty and ready for new content."
Loading