diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cf1c80..22b3e50 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches: [main] +permissions: + contents: read + env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 @@ -80,3 +83,14 @@ jobs: run: cargo doc --workspace --no-deps env: RUSTDOCFLAGS: -D warnings + + examples: + name: Examples + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Run fetch_urls example + run: cargo run -p fetchkit --example fetch_urls + timeout-minutes: 2 diff --git a/Cargo.lock b/Cargo.lock index 21b7414..db78ba6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -317,6 +328,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" name = "fetchkit" version = "0.1.0" dependencies = [ + "async-trait", "bytes", "futures", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index e1d6b47..d88dfdb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ description = "AI-friendly fetchkit tool, CLI, MCP server, and library" tokio = { version = "1", features = ["rt-multi-thread", "macros", "time", "sync"] } # HTTP client -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "gzip", "brotli", "deflate", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "gzip", "brotli", "deflate", "stream", "json"] } # Serialization serde = { version = "1", features = ["derive"] } @@ -42,3 +42,6 @@ bytes = "1" # Testing wiremock = "0.6" + +# Async traits +async-trait = "0.1" diff --git a/crates/fetchkit/Cargo.toml b/crates/fetchkit/Cargo.toml index 0abc335..3e69044 100644 --- a/crates/fetchkit/Cargo.toml +++ b/crates/fetchkit/Cargo.toml @@ -18,6 +18,7 @@ tracing = { workspace = true } thiserror = { workspace = true } futures = { workspace = true } bytes = { workspace = true } +async-trait = { workspace = true } [dev-dependencies] wiremock = { workspace = true } diff --git a/crates/fetchkit/examples/fetch_urls.rs b/crates/fetchkit/examples/fetch_urls.rs new file mode 100644 index 0000000..1d87c0d --- /dev/null +++ b/crates/fetchkit/examples/fetch_urls.rs @@ -0,0 +1,146 @@ +//! Example: Fetch various URLs and display results +//! +//! Run with: cargo run -p fetchkit --example fetch_urls +//! +//! This example demonstrates the fetcher system with different URL types. 
+ +use fetchkit::{fetch, FetchRequest, FetchResponse}; + +/// Test case definition +struct TestCase { + url: &'static str, + description: &'static str, + expect_format: Option<&'static str>, + expect_contains: Option<&'static str>, +} + +/// Define test cases here +const TEST_CASES: &[TestCase] = &[ + TestCase { + url: "https://example.com", + description: "Simple HTML page", + expect_format: Some("markdown"), + expect_contains: Some("Example Domain"), + }, + TestCase { + url: "https://httpbin.org/json", + description: "JSON endpoint", + expect_format: Some("raw"), + expect_contains: Some("slideshow"), + }, + TestCase { + url: "https://httpbin.org/html", + description: "HTML endpoint", + expect_format: Some("markdown"), + expect_contains: Some("Herman Melville"), + }, + TestCase { + url: "https://github.com/rust-lang/rust", + description: "GitHub repository (uses GitHubRepoFetcher)", + expect_format: Some("github_repo"), + expect_contains: Some("rust-lang/rust"), + }, + TestCase { + url: "https://raw.githubusercontent.com/rust-lang/rust/master/README.md", + description: "Raw markdown file", + expect_format: Some("raw"), + expect_contains: Some("Rust"), + }, +]; + +#[tokio::main] +async fn main() { + println!("FetchKit URL Examples"); + println!("=====================\n"); + + let mut passed = 0; + let mut failed = 0; + + for (i, case) in TEST_CASES.iter().enumerate() { + println!("{}. {}", i + 1, case.description); + println!(" URL: {}", case.url); + + let request = FetchRequest::new(case.url).as_markdown(); + + match fetch(request).await { + Ok(response) => { + let check_result = check_expectations(case, &response); + print_response_summary(&response); + + if check_result { + println!(" ✓ PASS\n"); + passed += 1; + } else { + println!(" ✗ FAIL (expectations not met)\n"); + failed += 1; + } + } + Err(e) => { + println!(" Error: {}", e); + println!(" ✗ FAIL\n"); + failed += 1; + } + } + } + + println!("====================="); + println!("Results: {} passed, {} failed", passed, failed); + + if failed > 0 { + std::process::exit(1); + } +} + +fn print_response_summary(response: &FetchResponse) { + println!(" Status: {}", response.status_code); + + if let Some(ref format) = response.format { + println!(" Format: {}", format); + } + + if let Some(ref ct) = response.content_type { + println!(" Content-Type: {}", ct); + } + + if let Some(size) = response.size { + println!(" Size: {} bytes", size); + } + + if let Some(ref content) = response.content { + let preview = content.chars().take(100).collect::<String>(); + let preview = preview.replace('\n', " "); + println!( + " Preview: {}{}", + preview, + if content.len() > 100 { "..."
} else { "" } + ); + } + + if let Some(ref error) = response.error { + println!(" Error: {}", error); + } +} + +fn check_expectations(case: &TestCase, response: &FetchResponse) -> bool { + // Check format + if let Some(expected_format) = case.expect_format { + if response.format.as_deref() != Some(expected_format) { + println!( + " Expected format '{}', got '{:?}'", + expected_format, response.format + ); + return false; + } + } + + // Check content contains + if let Some(expected_text) = case.expect_contains { + let content = response.content.as_deref().unwrap_or(""); + if !content.contains(expected_text) { + println!(" Expected content to contain '{}'", expected_text); + return false; + } + } + + true +} diff --git a/crates/fetchkit/src/client.rs b/crates/fetchkit/src/client.rs index aeef36d..0840fc3 100644 --- a/crates/fetchkit/src/client.rs +++ b/crates/fetchkit/src/client.rs @@ -1,40 +1,11 @@ //! HTTP client for FetchKit +//! +//! This module provides the main entry points for fetching URLs. +//! The actual fetch logic is implemented by fetchers in the [`fetchers`](crate::fetchers) module. -use crate::convert::{filter_excessive_newlines, html_to_markdown, html_to_text, is_html}; use crate::error::FetchError; -use crate::types::{FetchRequest, FetchResponse, HttpMethod}; -use crate::DEFAULT_USER_AGENT; -use bytes::Bytes; -use futures::StreamExt; -use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, CONTENT_DISPOSITION, USER_AGENT}; -use std::time::Duration; -use tracing::{error, warn}; - -/// Binary content type prefixes -const BINARY_PREFIXES: &[&str] = &[ - "image/", - "audio/", - "video/", - "application/octet-stream", - "application/pdf", - "application/zip", - "application/gzip", - "application/x-tar", - "application/x-rar", - "application/x-7z", - "application/vnd.ms-", - "application/vnd.openxmlformats", - "font/", -]; - -/// First-byte timeout (connect + first response byte) -const FIRST_BYTE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Body timeout (total) -const BODY_TIMEOUT: Duration = Duration::from_secs(30); - -/// Timeout message appended to truncated content -const TIMEOUT_MESSAGE: &str = "\n\n[..more content timed out...]"; +use crate::fetchers::FetcherRegistry; +use crate::types::{FetchRequest, FetchResponse}; /// Fetch options that can be configured via tool builder #[derive(Debug, Clone, Default)] @@ -52,330 +23,62 @@ pub struct FetchOptions { } /// Fetch a URL and return the response +/// +/// Uses the default fetcher registry with all built-in fetchers. +/// Markdown and text conversions are enabled by default. +/// For custom options, use [`fetch_with_options`]. pub async fn fetch(req: FetchRequest) -> Result<FetchResponse, FetchError> { - fetch_with_options(req, FetchOptions::default()).await + let options = FetchOptions { + enable_markdown: true, + enable_text: true, + ..Default::default() + }; + fetch_with_options(req, options).await } /// Fetch a URL with custom options +/// +/// Uses the default fetcher registry with all built-in fetchers. +/// For custom fetcher configuration, use [`FetcherRegistry`] directly.
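+///
+/// # Example
+///
+/// A minimal sketch (not from the original docs; the URL is a placeholder),
+/// showing how the [`FetchOptions`] fields defined above are passed in:
+///
+/// ```no_run
+/// use fetchkit::{fetch_with_options, FetchOptions, FetchRequest};
+///
+/// # async fn run() -> Result<(), fetchkit::FetchError> {
+/// let options = FetchOptions {
+///     enable_markdown: true,
+///     block_prefixes: vec!["http://localhost".to_string()],
+///     ..Default::default()
+/// };
+/// let req = FetchRequest::new("https://example.com").as_markdown();
+/// let resp = fetch_with_options(req, options).await?;
+/// println!("{}", resp.status_code);
+/// # Ok(())
+/// # }
+/// ```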
pub async fn fetch_with_options( req: FetchRequest, options: FetchOptions, ) -> Result<FetchResponse, FetchError> { - // Validate URL + // Validate URL early if req.url.is_empty() { return Err(FetchError::MissingUrl); } - if !req.url.starts_with("http://") && !req.url.starts_with("https://") { - return Err(FetchError::InvalidUrlScheme); - } - - // Check allow/block lists - if !options.allow_prefixes.is_empty() { - let allowed = options - .allow_prefixes - .iter() - .any(|prefix| req.url.starts_with(prefix)); - if !allowed { - return Err(FetchError::BlockedUrl); - } - } - - if options - .block_prefixes - .iter() - .any(|prefix| req.url.starts_with(prefix)) - { - return Err(FetchError::BlockedUrl); - } - - let method = req.effective_method(); - let wants_markdown = options.enable_markdown && req.wants_markdown(); - let wants_text = options.enable_text && req.wants_text(); - - // Build headers - let mut headers = HeaderMap::new(); - let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); - headers.insert( - USER_AGENT, - HeaderValue::from_str(user_agent) - .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)), - ); - - // Set Accept header based on conversion mode - let accept = if wants_markdown { - "text/html, text/markdown, text/plain, */*;q=0.8" - } else if wants_text { - "text/html, text/plain, */*;q=0.8" - } else { - "*/*" - }; - headers.insert(ACCEPT, HeaderValue::from_static(accept)); - - // Build client - let client = reqwest::Client::builder() - .default_headers(headers) - .connect_timeout(FIRST_BYTE_TIMEOUT) - .timeout(FIRST_BYTE_TIMEOUT) // Initial timeout for first byte - .build() - .map_err(FetchError::ClientBuildError)?; - - // Build request - let reqwest_method = match method { - HttpMethod::Get => reqwest::Method::GET, - HttpMethod::Head => reqwest::Method::HEAD, - }; - - let request = client.request(reqwest_method.clone(), &req.url); - - // Send request - let response = request.send().await.map_err(FetchError::from_reqwest)?; - - let status_code = response.status().as_u16(); - let headers = response.headers().clone(); - - // Extract metadata - let content_type = headers - .get("content-type") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()); - - let last_modified = headers - .get("last-modified") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()); - - let content_length: Option<u64> = headers - .get("content-length") - .and_then(|v| v.to_str().ok()) - .and_then(|s| s.parse().ok()); - - let filename = extract_filename(&headers, &req.url); - - // Handle HEAD request - if method == HttpMethod::Head { - return Ok(FetchResponse { - url: req.url, - status_code, - content_type, - size: content_length, - last_modified, - filename, - method: Some("HEAD".to_string()), - ..Default::default() - }); - } - - // Check for binary content - if let Some(ref ct) = content_type { - if is_binary_content_type(ct) { - return Ok(FetchResponse { - url: req.url, - status_code, - content_type, - size: content_length, - last_modified, - filename, - error: Some( - "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
- .to_string(), - ), - ..Default::default() - }); - } - } - - // Read body with timeout - let (body, truncated) = read_body_with_timeout(response, BODY_TIMEOUT).await; - let size = body.len() as u64; - - // Convert to string - let content = String::from_utf8_lossy(&body).to_string(); - - // Determine format and convert if needed - let (format, final_content) = if is_html(&content_type, &content) { - if wants_markdown { - ("markdown".to_string(), html_to_markdown(&content)) - } else if wants_text { - ("text".to_string(), html_to_text(&content)) - } else { - ("raw".to_string(), content) - } - } else { - ("raw".to_string(), content) - }; - - // Apply newline filtering - let mut final_content = filter_excessive_newlines(&final_content); - - // Add timeout message if truncated - if truncated { - final_content.push_str(TIMEOUT_MESSAGE); - } - - Ok(FetchResponse { - url: req.url, - status_code, - content_type, - size: Some(size), - last_modified, - filename, - format: Some(format), - content: Some(final_content), - truncated: if truncated { Some(true) } else { None }, - ..Default::default() - }) -} - -/// Check if content type indicates binary content -fn is_binary_content_type(content_type: &str) -> bool { - let ct_lower = content_type.to_lowercase(); - BINARY_PREFIXES - .iter() - .any(|prefix| ct_lower.starts_with(prefix)) -} - -/// Extract filename from Content-Disposition header or URL -fn extract_filename(headers: &HeaderMap, url: &str) -> Option<String> { - // Try Content-Disposition header first - if let Some(disposition) = headers.get(CONTENT_DISPOSITION) { - if let Ok(value) = disposition.to_str() { - if let Some(filename) = parse_content_disposition_filename(value) { - return Some(filename); - } - } - } - - // Fallback to URL path - if let Ok(parsed) = url::Url::parse(url) { - if let Some(mut segments) = parsed.path_segments() { - if let Some(last) = segments.next_back() { - if last.contains('.') && !last.is_empty() { - return Some(last.to_string()); - } - } - } - } - - None -} - -/// Parse filename from Content-Disposition header value -fn parse_content_disposition_filename(value: &str) -> Option<String> { - // Look for filename="..." or filename=... - let patterns = ["filename=\"", "filename="]; - for pattern in patterns { - if let Some(start) = value.find(pattern) { - let rest = &value[start + pattern.len()..]; - if pattern.ends_with('"') { - // Quoted - if let Some(end) = rest.find('"') { - return Some(rest[..end].to_string()); - } - } else { - // Unquoted - take until space or semicolon - let end = rest - .find(|c: char| c.is_whitespace() || c == ';') - .unwrap_or(rest.len()); - let filename = rest[..end].trim_matches('"'); - if !filename.is_empty() { - return Some(filename.to_string()); - } - } - } - } - None -} - -/// Read response body with timeout, returning partial content if timeout occurs -async fn read_body_with_timeout(response: reqwest::Response, timeout: Duration) -> (Bytes, bool) { - let mut body = Vec::new(); - let mut stream = response.bytes_stream(); - let deadline = tokio::time::Instant::now() + timeout; - - loop { - let chunk_future = stream.next(); - let timeout_future = tokio::time::sleep_until(deadline); - - tokio::select!
{ - chunk = chunk_future => { - match chunk { - Some(Ok(bytes)) => { - body.extend_from_slice(&bytes); - } - Some(Err(e)) => { - error!("Error reading body chunk: {}", e); - // Return partial content on error - let has_content = !body.is_empty(); - return (Bytes::from(body), has_content); - } - None => { - // Stream complete - return (Bytes::from(body), false); - } - } - } - _ = timeout_future => { - warn!("Body timeout reached, returning partial content"); - return (Bytes::from(body), true); - } - } - } + // Use registry with default fetchers + let registry = FetcherRegistry::with_defaults(); + registry.fetch(req, options).await } #[cfg(test)] mod tests { use super::*; - #[test] - fn test_is_binary_content_type() { - assert!(is_binary_content_type("image/png")); - assert!(is_binary_content_type("image/jpeg")); - assert!(is_binary_content_type("audio/mp3")); - assert!(is_binary_content_type("video/mp4")); - assert!(is_binary_content_type("application/pdf")); - assert!(is_binary_content_type("application/octet-stream")); - assert!(is_binary_content_type("application/zip")); - assert!(is_binary_content_type("application/vnd.ms-excel")); - assert!(is_binary_content_type( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - )); - assert!(is_binary_content_type("font/woff2")); - - assert!(!is_binary_content_type("text/html")); - assert!(!is_binary_content_type("text/plain")); - assert!(!is_binary_content_type("application/json")); - assert!(!is_binary_content_type("application/javascript")); + #[tokio::test] + async fn test_fetch_empty_url() { + let req = FetchRequest::new(""); + let result = fetch(req).await; + assert!(matches!(result, Err(FetchError::MissingUrl))); } - #[test] - fn test_parse_content_disposition_filename() { - assert_eq!( - parse_content_disposition_filename("attachment; filename=\"file.pdf\""), - Some("file.pdf".to_string()) - ); - assert_eq!( - parse_content_disposition_filename("attachment; filename=file.pdf"), - Some("file.pdf".to_string()) - ); - assert_eq!( - parse_content_disposition_filename("inline; filename=\"report.xlsx\"; size=1234"), - Some("report.xlsx".to_string()) - ); - assert_eq!(parse_content_disposition_filename("inline"), None); + #[tokio::test] + async fn test_fetch_invalid_scheme() { + let req = FetchRequest::new("ftp://example.com"); + let result = fetch(req).await; + assert!(matches!(result, Err(FetchError::InvalidUrlScheme))); } - #[test] - fn test_extract_filename_from_url() { - let headers = HeaderMap::new(); - assert_eq!( - extract_filename(&headers, "https://example.com/path/to/file.pdf"), - Some("file.pdf".to_string()) - ); - assert_eq!( - extract_filename(&headers, "https://example.com/path/to/document"), - None - ); - assert_eq!(extract_filename(&headers, "https://example.com/"), None); + #[tokio::test] + async fn test_fetch_options_default() { + let options = FetchOptions::default(); + assert!(options.user_agent.is_none()); + assert!(options.allow_prefixes.is_empty()); + assert!(options.block_prefixes.is_empty()); + assert!(!options.enable_markdown); + assert!(!options.enable_text); } } diff --git a/crates/fetchkit/src/error.rs b/crates/fetchkit/src/error.rs index 79cad94..da011b0 100644 --- a/crates/fetchkit/src/error.rs +++ b/crates/fetchkit/src/error.rs @@ -36,6 +36,10 @@ pub enum FetchError { /// Other request error #[error("Request failed: {0}")] RequestError(String), + + /// Fetcher-specific error + #[error("Fetcher error: {0}")] + FetcherError(String), } impl FetchError { diff --git 
a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs new file mode 100644 index 0000000..2cb076e --- /dev/null +++ b/crates/fetchkit/src/fetchers/default.rs @@ -0,0 +1,391 @@ +//! Default HTTP fetcher +//! +//! Handles general HTTP/HTTPS URLs with HTML conversion support. +//! This is the fallback fetcher that handles any URL not matched by +//! specialized fetchers. + +use crate::client::FetchOptions; +use crate::convert::{filter_excessive_newlines, html_to_markdown, html_to_text, is_html}; +use crate::error::FetchError; +use crate::fetchers::Fetcher; +use crate::types::{FetchRequest, FetchResponse, HttpMethod}; +use crate::DEFAULT_USER_AGENT; +use async_trait::async_trait; +use bytes::Bytes; +use futures::StreamExt; +use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, CONTENT_DISPOSITION, USER_AGENT}; +use std::time::Duration; +use tracing::{error, warn}; +use url::Url; + +/// Binary content type prefixes +const BINARY_PREFIXES: &[&str] = &[ + "image/", + "audio/", + "video/", + "application/octet-stream", + "application/pdf", + "application/zip", + "application/gzip", + "application/x-tar", + "application/x-rar", + "application/x-7z", + "application/vnd.ms-", + "application/vnd.openxmlformats", + "font/", +]; + +/// First-byte timeout (connect + first response byte) +const FIRST_BYTE_TIMEOUT: Duration = Duration::from_secs(1); + +/// Body timeout (total) +const BODY_TIMEOUT: Duration = Duration::from_secs(30); + +/// Timeout message appended to truncated content +const TIMEOUT_MESSAGE: &str = "\n\n[..more content timed out...]"; + +/// Default HTTP fetcher +/// +/// Handles all HTTP/HTTPS URLs with: +/// - GET and HEAD methods +/// - HTML to markdown/text conversion +/// - Binary content detection +/// - Timeout handling with partial content +pub struct DefaultFetcher; + +impl DefaultFetcher { + /// Create a new default fetcher + pub fn new() -> Self { + Self + } +} + +impl Default for DefaultFetcher { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl Fetcher for DefaultFetcher { + fn name(&self) -> &'static str { + "default" + } + + fn matches(&self, _url: &Url) -> bool { + // Default fetcher matches all URLs + true + } + + async fn fetch( + &self, + request: &FetchRequest, + options: &FetchOptions, + ) -> Result<FetchResponse, FetchError> { + // Validate URL + if request.url.is_empty() { + return Err(FetchError::MissingUrl); + } + + let method = request.effective_method(); + let wants_markdown = options.enable_markdown && request.wants_markdown(); + let wants_text = options.enable_text && request.wants_text(); + + // Build headers + let mut headers = HeaderMap::new(); + let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); + headers.insert( + USER_AGENT, + HeaderValue::from_str(user_agent) + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)), + ); + + // Set Accept header based on conversion mode + let accept = if wants_markdown { + "text/html, text/markdown, text/plain, */*;q=0.8" + } else if wants_text { + "text/html, text/plain, */*;q=0.8" + } else { + "*/*" + }; + headers.insert(ACCEPT, HeaderValue::from_static(accept)); + + // Build client + let client = reqwest::Client::builder() + .default_headers(headers) + .connect_timeout(FIRST_BYTE_TIMEOUT) + .timeout(FIRST_BYTE_TIMEOUT) + .build() + .map_err(FetchError::ClientBuildError)?; + + // Build request + let reqwest_method = match method { + HttpMethod::Get => reqwest::Method::GET, + HttpMethod::Head => reqwest::Method::HEAD, + }; + + let http_request =
client.request(reqwest_method, &request.url); + + // Send request + let response = http_request + .send() + .await + .map_err(FetchError::from_reqwest)?; + + let status_code = response.status().as_u16(); + let resp_headers = response.headers().clone(); + + // Extract metadata + let content_type = resp_headers + .get("content-type") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let last_modified = resp_headers + .get("last-modified") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let content_length: Option<u64> = resp_headers + .get("content-length") + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse().ok()); + + let filename = extract_filename(&resp_headers, &request.url); + + // Handle HEAD request + if method == HttpMethod::Head { + return Ok(FetchResponse { + url: request.url.clone(), + status_code, + content_type, + size: content_length, + last_modified, + filename, + method: Some("HEAD".to_string()), + ..Default::default() + }); + } + + // Check for binary content + if let Some(ref ct) = content_type { + if is_binary_content_type(ct) { + return Ok(FetchResponse { + url: request.url.clone(), + status_code, + content_type, + size: content_length, + last_modified, + filename, + error: Some( + "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched." + .to_string(), + ), + ..Default::default() + }); + } + } + + // Read body with timeout + let (body, truncated) = read_body_with_timeout(response, BODY_TIMEOUT).await; + let size = body.len() as u64; + + // Convert to string + let content = String::from_utf8_lossy(&body).to_string(); + + // Determine format and convert if needed + let (format, final_content) = if is_html(&content_type, &content) { + if wants_markdown { + ("markdown".to_string(), html_to_markdown(&content)) + } else if wants_text { + ("text".to_string(), html_to_text(&content)) + } else { + ("raw".to_string(), content) + } + } else { + ("raw".to_string(), content) + }; + + // Apply newline filtering + let mut final_content = filter_excessive_newlines(&final_content); + + // Add timeout message if truncated + if truncated { + final_content.push_str(TIMEOUT_MESSAGE); + } + + Ok(FetchResponse { + url: request.url.clone(), + status_code, + content_type, + size: Some(size), + last_modified, + filename, + format: Some(format), + content: Some(final_content), + truncated: if truncated { Some(true) } else { None }, + ..Default::default() + }) + } +} + +/// Check if content type indicates binary content +fn is_binary_content_type(content_type: &str) -> bool { + let ct_lower = content_type.to_lowercase(); + BINARY_PREFIXES + .iter() + .any(|prefix| ct_lower.starts_with(prefix)) +} + +/// Extract filename from Content-Disposition header or URL +fn extract_filename(headers: &HeaderMap, url: &str) -> Option<String> { + // Try Content-Disposition header first + if let Some(disposition) = headers.get(CONTENT_DISPOSITION) { + if let Ok(value) = disposition.to_str() { + if let Some(filename) = parse_content_disposition_filename(value) { + return Some(filename); + } + } + } + + // Fallback to URL path + if let Ok(parsed) = url::Url::parse(url) { + if let Some(mut segments) = parsed.path_segments() { + if let Some(last) = segments.next_back() { + if last.contains('.') && !last.is_empty() { + return Some(last.to_string()); + } + } + } + } + + None +} + +/// Parse filename from Content-Disposition header value +fn parse_content_disposition_filename(value: &str) -> Option<String> { + let patterns = ["filename=\"", "filename="]; + for
pattern in patterns { + if let Some(start) = value.find(pattern) { + let rest = &value[start + pattern.len()..]; + if pattern.ends_with('"') { + // Quoted + if let Some(end) = rest.find('"') { + return Some(rest[..end].to_string()); + } + } else { + // Unquoted - take until space or semicolon + let end = rest + .find(|c: char| c.is_whitespace() || c == ';') + .unwrap_or(rest.len()); + let filename = rest[..end].trim_matches('"'); + if !filename.is_empty() { + return Some(filename.to_string()); + } + } + } + } + None +} + +/// Read response body with timeout, returning partial content if timeout occurs +async fn read_body_with_timeout(response: reqwest::Response, timeout: Duration) -> (Bytes, bool) { + let mut body = Vec::new(); + let mut stream = response.bytes_stream(); + let deadline = tokio::time::Instant::now() + timeout; + + loop { + let chunk_future = stream.next(); + let timeout_future = tokio::time::sleep_until(deadline); + + tokio::select! { + chunk = chunk_future => { + match chunk { + Some(Ok(bytes)) => { + body.extend_from_slice(&bytes); + } + Some(Err(e)) => { + error!("Error reading body chunk: {}", e); + let has_content = !body.is_empty(); + return (Bytes::from(body), has_content); + } + None => { + // Stream complete + return (Bytes::from(body), false); + } + } + } + _ = timeout_future => { + warn!("Body timeout reached, returning partial content"); + return (Bytes::from(body), true); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_binary_content_type() { + assert!(is_binary_content_type("image/png")); + assert!(is_binary_content_type("image/jpeg")); + assert!(is_binary_content_type("audio/mp3")); + assert!(is_binary_content_type("video/mp4")); + assert!(is_binary_content_type("application/pdf")); + assert!(is_binary_content_type("application/octet-stream")); + assert!(is_binary_content_type("application/zip")); + assert!(is_binary_content_type("application/vnd.ms-excel")); + assert!(is_binary_content_type( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + )); + assert!(is_binary_content_type("font/woff2")); + + assert!(!is_binary_content_type("text/html")); + assert!(!is_binary_content_type("text/plain")); + assert!(!is_binary_content_type("application/json")); + assert!(!is_binary_content_type("application/javascript")); + } + + #[test] + fn test_parse_content_disposition_filename() { + assert_eq!( + parse_content_disposition_filename("attachment; filename=\"file.pdf\""), + Some("file.pdf".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("attachment; filename=file.pdf"), + Some("file.pdf".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("inline; filename=\"report.xlsx\"; size=1234"), + Some("report.xlsx".to_string()) + ); + assert_eq!(parse_content_disposition_filename("inline"), None); + } + + #[test] + fn test_extract_filename_from_url() { + let headers = HeaderMap::new(); + assert_eq!( + extract_filename(&headers, "https://example.com/path/to/file.pdf"), + Some("file.pdf".to_string()) + ); + assert_eq!( + extract_filename(&headers, "https://example.com/path/to/document"), + None + ); + assert_eq!(extract_filename(&headers, "https://example.com/"), None); + } + + #[test] + fn test_default_fetcher_matches_all() { + let fetcher = DefaultFetcher::new(); + let url = Url::parse("https://example.com").unwrap(); + assert!(fetcher.matches(&url)); + + let url = Url::parse("https://github.com/owner/repo").unwrap(); + assert!(fetcher.matches(&url)); + } +} diff --git 
a/crates/fetchkit/src/fetchers/github_repo.rs b/crates/fetchkit/src/fetchers/github_repo.rs new file mode 100644 index 0000000..f6e308d --- /dev/null +++ b/crates/fetchkit/src/fetchers/github_repo.rs @@ -0,0 +1,499 @@ +//! GitHub repository fetcher +//! +//! Handles GitHub repository root URLs, returning repo metadata and README content. + +use crate::client::FetchOptions; +use crate::error::FetchError; +use crate::fetchers::Fetcher; +use crate::types::{FetchRequest, FetchResponse}; +use crate::DEFAULT_USER_AGENT; +use async_trait::async_trait; +use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT}; +use serde::Deserialize; +use std::time::Duration; +use url::Url; + +/// First-byte timeout for API requests +const API_TIMEOUT: Duration = Duration::from_secs(10); + +/// GitHub repository fetcher +/// +/// Matches GitHub repository root URLs (`https://github.com/{owner}/{repo}`) +/// and returns repository metadata along with README content. +pub struct GitHubRepoFetcher; + +impl GitHubRepoFetcher { + /// Create a new GitHub repo fetcher + pub fn new() -> Self { + Self + } + + /// Extract owner and repo from a GitHub URL + fn parse_github_url(url: &Url) -> Option<(String, String)> { + // Must be github.com + if url.host_str() != Some("github.com") { + return None; + } + + // Get path segments + let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default(); + + // Must have exactly 2 segments (owner/repo) + // Ignore URLs like /owner/repo/issues, /owner/repo/blob/main/file.rs + if segments.len() != 2 { + return None; + } + + let owner = segments[0]; + let repo = segments[1]; + + // Basic validation + if owner.is_empty() || repo.is_empty() { + return None; + } + + // Ignore special GitHub paths + let reserved = [ + "settings", + "explore", + "trending", + "collections", + "events", + "sponsors", + "notifications", + "marketplace", + "pulls", + "issues", + "codespaces", + "features", + "enterprise", + "organizations", + "pricing", + "about", + "team", + "security", + "login", + "join", + ]; + if reserved.contains(&owner) { + return None; + } + + Some((owner.to_string(), repo.to_string())) + } +} + +impl Default for GitHubRepoFetcher { + fn default() -> Self { + Self::new() + } +} + +/// GitHub API repository response (partial) +#[derive(Debug, Deserialize)] +struct GitHubRepo { + #[allow(dead_code)] + name: String, + full_name: String, + description: Option<String>, + html_url: String, + homepage: Option<String>, + stargazers_count: u64, + forks_count: u64, + open_issues_count: u64, + language: Option<String>, + license: Option<GitHubLicense>, + default_branch: String, + created_at: String, + updated_at: String, + pushed_at: String, + topics: Option<Vec<String>>, + archived: bool, + fork: bool, + owner: GitHubOwner, +} + +#[derive(Debug, Deserialize)] +struct GitHubLicense { + name: String, + spdx_id: Option<String>, +} + +#[derive(Debug, Deserialize)] +struct GitHubOwner { + login: String, + #[serde(rename = "type")] + owner_type: String, +} + +/// GitHub API README response +#[derive(Debug, Deserialize)] +struct GitHubReadme { + content: String, + encoding: String, +} + +#[async_trait] +impl Fetcher for GitHubRepoFetcher { + fn name(&self) -> &'static str { + "github_repo" + } + + fn matches(&self, url: &Url) -> bool { + Self::parse_github_url(url).is_some() + } + + async fn fetch( + &self, + request: &FetchRequest, + options: &FetchOptions, + ) -> Result<FetchResponse, FetchError> { + let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; + + let (owner, repo) = Self::parse_github_url(&url).ok_or_else(|| { +
FetchError::FetcherError("Not a valid GitHub repository URL".to_string()) + })?; + + // Build HTTP client + let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); + let client = reqwest::Client::builder() + .connect_timeout(API_TIMEOUT) + .timeout(API_TIMEOUT) + .build() + .map_err(FetchError::ClientBuildError)?; + + // Fetch repository metadata + let repo_url = format!("https://api.github.com/repos/{}/{}", owner, repo); + let repo_response = client + .get(&repo_url) + .header( + USER_AGENT, + HeaderValue::from_str(user_agent) + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)), + ) + .header( + ACCEPT, + HeaderValue::from_static("application/vnd.github+json"), + ) + .send() + .await + .map_err(FetchError::from_reqwest)?; + + let status_code = repo_response.status().as_u16(); + + // Handle non-success status + if !repo_response.status().is_success() { + let error_msg = if status_code == 404 { + format!("Repository {}/{} not found", owner, repo) + } else if status_code == 403 { + "GitHub API rate limit exceeded".to_string() + } else { + format!("GitHub API error: HTTP {}", status_code) + }; + return Ok(FetchResponse { + url: request.url.clone(), + status_code, + error: Some(error_msg), + ..Default::default() + }); + } + + // Parse repository data + let repo_data: GitHubRepo = repo_response + .json() + .await + .map_err(|e| FetchError::FetcherError(format!("Failed to parse repo data: {}", e)))?; + + // Fetch README (optional - don't fail if missing) + let readme_url = format!("https://api.github.com/repos/{}/{}/readme", owner, repo); + let readme_content = match client + .get(&readme_url) + .header( + USER_AGENT, + HeaderValue::from_str(user_agent) + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)), + ) + .header( + ACCEPT, + HeaderValue::from_static("application/vnd.github+json"), + ) + .send() + .await + { + Ok(resp) if resp.status().is_success() => { + match resp.json::<GitHubReadme>().await { + Ok(readme) if readme.encoding == "base64" => { + // Decode base64 content + decode_base64_content(&readme.content) + } + _ => None, + } + } + _ => None, + }; + + // Format response as markdown + let content = format_github_repo_response(&repo_data, readme_content.as_deref()); + + Ok(FetchResponse { + url: request.url.clone(), + status_code: 200, + content_type: Some("text/markdown".to_string()), + format: Some("github_repo".to_string()), + content: Some(content), + ..Default::default() + }) + } +} + +/// Decode base64-encoded content (GitHub API returns README as base64) +fn decode_base64_content(encoded: &str) -> Option<String> { + // GitHub base64 has newlines, remove them + let cleaned: String = encoded.chars().filter(|c| !c.is_whitespace()).collect(); + + // Simple base64 decode + let decoded = base64_decode(&cleaned)?; + String::from_utf8(decoded).ok() +} + +/// Basic base64 decoder (avoiding extra dependency) +fn base64_decode(input: &str) -> Option<Vec<u8>> { + const ALPHABET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + fn decode_char(c: u8) -> Option<u8> { + if c == b'=' { + return Some(0); + } + ALPHABET.iter().position(|&x| x == c).map(|p| p as u8) + } + + let bytes: Vec<u8> = input.bytes().collect(); + if !bytes.len().is_multiple_of(4) { + return None; + } + + let mut result = Vec::with_capacity(bytes.len() * 3 / 4); + + for chunk in bytes.chunks(4) { + let a = decode_char(chunk[0])?; + let b = decode_char(chunk[1])?; + let c = decode_char(chunk[2])?; + let d = decode_char(chunk[3])?; + + result.push((a << 2) | (b >> 4)); + if
chunk[2] != b'=' { + result.push((b << 4) | (c >> 2)); + } + if chunk[3] != b'=' { + result.push((c << 6) | d); + } + } + + Some(result) +} + +/// Format GitHub repo data as markdown +fn format_github_repo_response(repo: &GitHubRepo, readme: Option<&str>) -> String { + let mut output = String::new(); + + // Header + output.push_str(&format!("# {}\n\n", repo.full_name)); + + // Description + if let Some(ref desc) = repo.description { + output.push_str(&format!("{}\n\n", desc)); + } + + // Metadata section + output.push_str("## Repository Info\n\n"); + + // Stats + output.push_str(&format!( + "- **Stars:** {}\n- **Forks:** {}\n- **Open Issues:** {}\n", + repo.stargazers_count, repo.forks_count, repo.open_issues_count + )); + + // Language + if let Some(ref lang) = repo.language { + output.push_str(&format!("- **Language:** {}\n", lang)); + } + + // License + if let Some(ref license) = repo.license { + let license_str = license + .spdx_id + .as_ref() + .unwrap_or(&license.name) + .to_string(); + output.push_str(&format!("- **License:** {}\n", license_str)); + } + + // Topics + if let Some(ref topics) = repo.topics { + if !topics.is_empty() { + output.push_str(&format!("- **Topics:** {}\n", topics.join(", "))); + } + } + + // Links + output.push_str(&format!("- **URL:** {}\n", repo.html_url)); + if let Some(ref homepage) = repo.homepage { + if !homepage.is_empty() { + output.push_str(&format!("- **Homepage:** {}\n", homepage)); + } + } + + // Branch info + output.push_str(&format!("- **Default Branch:** {}\n", repo.default_branch)); + + // Owner info + output.push_str(&format!( + "- **Owner:** {} ({})\n", + repo.owner.login, repo.owner.owner_type + )); + + // Status flags + if repo.archived { + output.push_str("- **Status:** Archived\n"); + } + if repo.fork { + output.push_str("- **Fork:** Yes\n"); + } + + // Dates + output.push_str(&format!("- **Created:** {}\n", repo.created_at)); + output.push_str(&format!("- **Last Updated:** {}\n", repo.updated_at)); + output.push_str(&format!("- **Last Push:** {}\n", repo.pushed_at)); + + // README content + if let Some(readme_content) = readme { + output.push_str("\n---\n\n## README\n\n"); + output.push_str(readme_content); + } + + output +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_github_url_valid() { + let url = Url::parse("https://github.com/owner/repo").unwrap(); + assert_eq!( + GitHubRepoFetcher::parse_github_url(&url), + Some(("owner".to_string(), "repo".to_string())) + ); + } + + #[test] + fn test_parse_github_url_with_trailing_slash() { + // URL parser normalizes away trailing slash for path + let url = Url::parse("https://github.com/owner/repo/").unwrap(); + // This actually has 3 segments: ["owner", "repo", ""] + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + } + + #[test] + fn test_parse_github_url_too_many_segments() { + let url = Url::parse("https://github.com/owner/repo/issues").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + + let url = Url::parse("https://github.com/owner/repo/blob/main/README.md").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + } + + #[test] + fn test_parse_github_url_too_few_segments() { + let url = Url::parse("https://github.com/owner").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + + let url = Url::parse("https://github.com/").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + } + + #[test] + fn test_parse_github_url_reserved_paths() { + let url = 
Url::parse("https://github.com/settings/profile").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + + let url = Url::parse("https://github.com/explore/topics").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + } + + #[test] + fn test_parse_github_url_wrong_host() { + let url = Url::parse("https://gitlab.com/owner/repo").unwrap(); + assert_eq!(GitHubRepoFetcher::parse_github_url(&url), None); + } + + #[test] + fn test_fetcher_matches() { + let fetcher = GitHubRepoFetcher::new(); + + let url = Url::parse("https://github.com/rust-lang/rust").unwrap(); + assert!(fetcher.matches(&url)); + + let url = Url::parse("https://github.com/rust-lang/rust/issues").unwrap(); + assert!(!fetcher.matches(&url)); + + let url = Url::parse("https://example.com/foo/bar").unwrap(); + assert!(!fetcher.matches(&url)); + } + + #[test] + fn test_base64_decode() { + // "Hello, World!" in base64 + assert_eq!( + base64_decode("SGVsbG8sIFdvcmxkIQ=="), + Some(b"Hello, World!".to_vec()) + ); + + // Empty string + assert_eq!(base64_decode(""), Some(vec![])); + + // Invalid length + assert_eq!(base64_decode("abc"), None); + } + + #[test] + fn test_format_github_repo_response() { + let repo = GitHubRepo { + name: "test-repo".to_string(), + full_name: "owner/test-repo".to_string(), + description: Some("A test repository".to_string()), + html_url: "https://github.com/owner/test-repo".to_string(), + homepage: None, + stargazers_count: 100, + forks_count: 10, + open_issues_count: 5, + language: Some("Rust".to_string()), + license: Some(GitHubLicense { + name: "MIT License".to_string(), + spdx_id: Some("MIT".to_string()), + }), + default_branch: "main".to_string(), + created_at: "2024-01-01T00:00:00Z".to_string(), + updated_at: "2024-06-01T00:00:00Z".to_string(), + pushed_at: "2024-06-01T00:00:00Z".to_string(), + topics: Some(vec!["rust".to_string(), "cli".to_string()]), + archived: false, + fork: false, + owner: GitHubOwner { + login: "owner".to_string(), + owner_type: "User".to_string(), + }, + }; + + let output = format_github_repo_response(&repo, Some("# Test\n\nThis is a test README.")); + + assert!(output.contains("# owner/test-repo")); + assert!(output.contains("A test repository")); + assert!(output.contains("**Stars:** 100")); + assert!(output.contains("**Language:** Rust")); + assert!(output.contains("**License:** MIT")); + assert!(output.contains("## README")); + assert!(output.contains("This is a test README.")); + } +} diff --git a/crates/fetchkit/src/fetchers/mod.rs b/crates/fetchkit/src/fetchers/mod.rs new file mode 100644 index 0000000..6fc44d2 --- /dev/null +++ b/crates/fetchkit/src/fetchers/mod.rs @@ -0,0 +1,158 @@ +//! Fetcher system for specialized content fetching +//! +//! Design: Each fetcher handles specific URL patterns with custom logic. +//! FetcherRegistry dispatches to the first matching fetcher. + +mod default; +mod github_repo; + +pub use default::DefaultFetcher; +pub use github_repo::GitHubRepoFetcher; + +use crate::client::FetchOptions; +use crate::error::FetchError; +use crate::types::{FetchRequest, FetchResponse}; +use async_trait::async_trait; +use url::Url; + +/// Trait for specialized content fetchers +/// +/// Implement this trait to create custom fetchers for specific URL patterns. +/// Each fetcher declares what URLs it can handle via `matches()` and +/// performs the actual fetch via `fetch()`. 
+#[async_trait] +pub trait Fetcher: Send + Sync { + /// Unique identifier for this fetcher (for logging/debugging) + fn name(&self) -> &'static str; + + /// Returns true if this fetcher can handle the given URL + /// + /// Called by the registry to determine which fetcher to use. + /// More specific fetchers should be registered before generic ones. + fn matches(&self, url: &Url) -> bool; + + /// Fetch content from the URL + /// + /// Called only if `matches()` returned true. + /// Returns a FetchResponse on success or FetchError on failure. + async fn fetch( + &self, + request: &FetchRequest, + options: &FetchOptions, + ) -> Result<FetchResponse, FetchError>; +} + +/// Registry of fetchers that dispatches to the appropriate handler +/// +/// Maintains an ordered list of fetchers. When fetching a URL, iterates +/// through fetchers and uses the first one that matches. +pub struct FetcherRegistry { + fetchers: Vec<Box<dyn Fetcher>>, +} + +impl Default for FetcherRegistry { + fn default() -> Self { + Self::new() + } +} + +impl FetcherRegistry { + /// Create an empty registry + pub fn new() -> Self { + Self { + fetchers: Vec::new(), + } + } + + /// Create a registry with default fetchers pre-registered + /// + /// Includes (in order of priority): + /// 1. GitHubRepoFetcher - handles GitHub repository URLs + /// 2. DefaultFetcher - handles all HTTP/HTTPS URLs + pub fn with_defaults() -> Self { + let mut registry = Self::new(); + // Register specialized fetchers first (higher priority) + registry.register(Box::new(GitHubRepoFetcher::new())); + // Default fetcher last (catches all remaining URLs) + registry.register(Box::new(DefaultFetcher::new())); + registry + } + + /// Register a fetcher + /// + /// Fetchers are checked in registration order, so register more + /// specific fetchers before generic ones. + pub fn register(&mut self, fetcher: Box<dyn Fetcher>) { + self.fetchers.push(fetcher); + } + + /// Fetch a URL using the appropriate fetcher + /// + /// Iterates through registered fetchers and uses the first one + /// that matches the URL. Returns an error if no fetcher matches + /// (shouldn't happen with DefaultFetcher registered).
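+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch (placeholder URL, not from the original docs):
+    ///
+    /// ```no_run
+    /// use fetchkit::{FetchOptions, FetchRequest, FetcherRegistry};
+    ///
+    /// # async fn run() -> Result<(), fetchkit::FetchError> {
+    /// let registry = FetcherRegistry::with_defaults();
+    /// let resp = registry
+    ///     .fetch(FetchRequest::new("https://example.com"), FetchOptions::default())
+    ///     .await?;
+    /// println!("{}", resp.status_code);
+    /// # Ok(())
+    /// # }
+    /// ```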
+ pub async fn fetch( + &self, + request: FetchRequest, + options: FetchOptions, + ) -> Result<FetchResponse, FetchError> { + // Validate URL scheme early + if !request.url.starts_with("http://") && !request.url.starts_with("https://") { + return Err(FetchError::InvalidUrlScheme); + } + + // Parse URL for matching + let parsed_url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; + + // Check allow/block lists before fetcher matching + if !options.allow_prefixes.is_empty() { + let allowed = options + .allow_prefixes + .iter() + .any(|prefix| request.url.starts_with(prefix)); + if !allowed { + return Err(FetchError::BlockedUrl); + } + } + + if options + .block_prefixes + .iter() + .any(|prefix| request.url.starts_with(prefix)) + { + return Err(FetchError::BlockedUrl); + } + + // Find matching fetcher + for fetcher in &self.fetchers { + if fetcher.matches(&parsed_url) { + tracing::debug!(fetcher = fetcher.name(), url = %request.url, "Using fetcher"); + return fetcher.fetch(&request, &options).await; + } + } + + // No fetcher matched (shouldn't happen with DefaultFetcher) + Err(FetchError::FetcherError( + "No fetcher available for URL".to_string(), + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_registry_with_defaults() { + let registry = FetcherRegistry::with_defaults(); + assert_eq!(registry.fetchers.len(), 2); + assert_eq!(registry.fetchers[0].name(), "github_repo"); + assert_eq!(registry.fetchers[1].name(), "default"); + } + + #[test] + fn test_empty_registry() { + let registry = FetcherRegistry::new(); + assert!(registry.fetchers.is_empty()); + } +} diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs index 5ae05e8..930f5e4 100644 --- a/crates/fetchkit/src/lib.rs +++ b/crates/fetchkit/src/lib.rs @@ -2,16 +2,28 @@ //! //! This crate provides a reusable library API for fetching web content, //! with optional HTML to markdown/text conversion. +//! +//! ## Fetcher System +//! +//! FetchKit uses a pluggable fetcher system where specialized fetchers +//! handle specific URL patterns. The [`FetcherRegistry`] dispatches +//! requests to the appropriate fetcher based on URL matching. +//! +//! Built-in fetchers: +//! - [`DefaultFetcher`] - General HTTP/HTTPS fetcher with HTML conversion +//! - [`GitHubRepoFetcher`] - GitHub repository metadata and README -mod client; +pub mod client; mod convert; mod error; +pub mod fetchers; mod tool; mod types; -pub use client::fetch; +pub use client::{fetch, fetch_with_options, FetchOptions}; pub use convert::{html_to_markdown, html_to_text}; pub use error::FetchError; +pub use fetchers::{DefaultFetcher, Fetcher, FetcherRegistry, GitHubRepoFetcher}; pub use tool::{Tool, ToolBuilder, ToolStatus}; pub use types::{FetchRequest, FetchResponse, HttpMethod}; diff --git a/crates/fetchkit/tests/integration.rs b/crates/fetchkit/tests/integration.rs index d2a4af6..156a731 100644 --- a/crates/fetchkit/tests/integration.rs +++ b/crates/fetchkit/tests/integration.rs @@ -1,6 +1,8 @@ //!
Integration tests for FetchKit using wiremock -use fetchkit::{fetch, FetchRequest, HttpMethod, Tool}; +use fetchkit::{ + fetch, fetch_with_options, FetchOptions, FetchRequest, FetcherRegistry, HttpMethod, Tool, +}; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, ResponseTemplate}; @@ -442,3 +444,158 @@ async fn test_excessive_newlines_filtered() { // Should have at most 2 consecutive newlines assert!(!resp.content.unwrap().contains("\n\n\n")); } + +// ============================================================================ +// Fetcher System Integration Tests +// ============================================================================ + +#[tokio::test] +async fn test_fetcher_registry_with_defaults() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/page")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("
<html><body><h1>Test</h1></body></html>
") + .insert_header("content-type", "text/html"), + ) + .mount(&mock_server) + .await; + + let registry = FetcherRegistry::with_defaults(); + let options = FetchOptions { + enable_markdown: true, + enable_text: true, + ..Default::default() + }; + + let req = FetchRequest::new(format!("{}/page", mock_server.uri())).as_markdown(); + let resp = registry.fetch(req, options).await.unwrap(); + + assert_eq!(resp.status_code, 200); + assert_eq!(resp.format, Some("markdown".to_string())); + assert!(resp.content.unwrap().contains("# Test")); +} + +#[tokio::test] +async fn test_fetcher_registry_url_validation() { + let registry = FetcherRegistry::with_defaults(); + let options = FetchOptions::default(); + + // Invalid scheme + let req = FetchRequest::new("ftp://example.com"); + let result = registry.fetch(req, options.clone()).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("http://")); + + // Empty URL handled by fetch_with_options before registry + let req = FetchRequest::new(""); + let result = fetch_with_options(req, options).await; + assert!(result.is_err()); +} + +#[tokio::test] +async fn test_fetcher_registry_allow_block_lists() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/")) + .respond_with(ResponseTemplate::new(200).set_body_string("OK")) + .mount(&mock_server) + .await; + + let registry = FetcherRegistry::with_defaults(); + + // Block list + let options = FetchOptions { + block_prefixes: vec!["http://127.0.0.1".to_string()], + ..Default::default() + }; + let req = FetchRequest::new(format!("{}/", mock_server.uri())); + let result = registry.fetch(req, options).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("Blocked")); + + // Allow list (not matching) + let options = FetchOptions { + allow_prefixes: vec!["https://allowed.com".to_string()], + ..Default::default() + }; + let req = FetchRequest::new(format!("{}/", mock_server.uri())); + let result = registry.fetch(req, options).await; + assert!(result.is_err()); +} + +#[tokio::test] +async fn test_github_fetcher_url_matching() { + // These URLs should NOT match GitHubRepoFetcher (will use DefaultFetcher) + let mock_server = MockServer::start().await; + + // Mock for non-GitHub URLs + Mock::given(method("GET")) + .and(path("/owner/repo/issues")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("issues page") + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let req = FetchRequest::new(format!("{}/owner/repo/issues", mock_server.uri())); + let resp = fetch(req).await.unwrap(); + + // Should use default fetcher (format is "raw", not "github_repo") + assert_eq!(resp.format, Some("raw".to_string())); + assert!(resp.content.unwrap().contains("issues page")); +} + +#[tokio::test] +async fn test_fetch_enables_conversions_by_default() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("
<html><body><h1>Hello</h1></body></html>
") + .insert_header("content-type", "text/html"), + ) + .mount(&mock_server) + .await; + + // Using fetch() with as_markdown() should work + let req = FetchRequest::new(format!("{}/", mock_server.uri())).as_markdown(); + let resp = fetch(req).await.unwrap(); + + assert_eq!(resp.format, Some("markdown".to_string())); +} + +#[tokio::test] +async fn test_fetch_with_options_respects_disabled_conversion() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("
<html><body><h1>Hello</h1></body></html>
") + .insert_header("content-type", "text/html"), + ) + .mount(&mock_server) + .await; + + // Disable markdown conversion + let options = FetchOptions { + enable_markdown: false, + enable_text: false, + ..Default::default() + }; + + let req = FetchRequest::new(format!("{}/", mock_server.uri())).as_markdown(); + let resp = fetch_with_options(req, options).await.unwrap(); + + // Should be raw because conversion is disabled + assert_eq!(resp.format, Some("raw".to_string())); +} diff --git a/specs/fetchers.md b/specs/fetchers.md new file mode 100644 index 0000000..371f0cb --- /dev/null +++ b/specs/fetchers.md @@ -0,0 +1,152 @@ +# Fetcher System Specification + +## Abstract + +Fetcher system enables specialized content fetching based on URL patterns. Each fetcher handles specific URL types (e.g., GitHub repos, binary files) with custom logic, returning structured responses optimized for LLM consumption. + +## Requirements + +### Fetcher Trait + +Each fetcher must implement: + +1. **`name()`** - Unique identifier string for logging/debugging +2. **`matches(url)`** - Returns true if this fetcher handles the URL +3. **`fetch(request, options)`** - Async fetch returning `FetchResponse` or error + +### Fetcher Registry + +Central dispatcher that: + +1. Maintains ordered list of fetchers (most specific first) +2. Iterates fetchers, uses first matching one +3. Falls back to default fetcher if none match +4. Provides `register()` for adding custom fetchers +5. Validates URL scheme and allow/block lists before dispatching + +### Built-in Fetchers + +#### DefaultFetcher (lowest priority) + +- Matches: All HTTP/HTTPS URLs +- Behavior: Standard HTTP fetch with HTML conversion support +- Features: + - GET and HEAD methods + - HTML to markdown/text conversion (when enabled) + - Binary content detection (returns metadata only) + - Timeout handling with partial content support +- Returns: Standard `FetchResponse` with format `"markdown"`, `"text"`, or `"raw"` + +#### GitHubRepoFetcher + +- Matches: `https://github.com/{owner}/{repo}` (exactly 2 path segments) +- Excludes: Reserved paths (settings, explore, trending, etc.) +- Behavior: + 1. Fetch repo metadata via GitHub API (`/repos/{owner}/{repo}`) + 2. Fetch README content if exists (`/repos/{owner}/{repo}/readme`) + 3. Decode base64 README content + 4. 
Combine into structured markdown response +- Returns: Markdown with repo metadata + README content +- Response format field: `"github_repo"` +- Metadata includes: stars, forks, issues, language, license, topics, dates + +### Response Extensions + +`FetchResponse.format` values: +- `"markdown"` - HTML converted to markdown +- `"text"` - HTML converted to plain text +- `"raw"` - Original content unchanged +- `"github_repo"` - GitHub repository metadata + README + +### Configuration + +Fetchers receive `FetchOptions` for: +- `user_agent` - Custom User-Agent string +- `allow_prefixes` - URL prefix allow list +- `block_prefixes` - URL prefix block list +- `enable_markdown` - Enable markdown conversion +- `enable_text` - Enable text conversion + +### Extensibility + +Design supports hundreds of fetchers by: +- Each fetcher in separate file under `fetchers/` module +- Simple registration pattern via `registry.register()` +- No compile-time limit on fetcher count +- Priority determined by registration order + +### Error Handling + +- Fetcher errors bubble up as `FetchError` +- If specialized fetcher fails, does NOT fall back to default (explicit failure) +- `FetchError::FetcherError(String)` for fetcher-specific errors +- GitHub API errors return response with error field set + +## Module Structure + +``` +crates/fetchkit/src/ +├── fetchers/ +│ ├── mod.rs # Fetcher trait, FetcherRegistry +│ ├── default.rs # DefaultFetcher +│ └── github_repo.rs # GitHubRepoFetcher +``` + +## API + +```rust +// Fetcher trait +#[async_trait] +pub trait Fetcher: Send + Sync { + fn name(&self) -> &'static str; + fn matches(&self, url: &Url) -> bool; + async fn fetch(&self, request: &FetchRequest, options: &FetchOptions) + -> Result<FetchResponse, FetchError>; +} + +// Registry +pub struct FetcherRegistry { + fetchers: Vec<Box<dyn Fetcher>>, +} + +impl FetcherRegistry { + pub fn new() -> Self; // Empty registry + pub fn with_defaults() -> Self; // Pre-populated with built-in fetchers + pub fn register(&mut self, fetcher: Box<dyn Fetcher>); + pub async fn fetch(&self, request: FetchRequest, options: FetchOptions) + -> Result<FetchResponse, FetchError>; +} + +// Convenience functions +pub async fn fetch(req: FetchRequest) -> Result<FetchResponse, FetchError>; +pub async fn fetch_with_options(req: FetchRequest, options: FetchOptions) + -> Result<FetchResponse, FetchError>; +``` + +## Testing + +### Unit Tests +- Per-fetcher tests with mocked HTTP (wiremock) +- URL matching logic tests +- Response parsing tests + +### Integration Tests +- Registry dispatch tests +- End-to-end fetch tests with mock server + +### Example-based Tests +Run with: `cargo run -p fetchkit --example fetch_urls` + +Tests real URLs: +- Simple HTML pages (example.com) +- JSON endpoints (httpbin.org) +- GitHub repositories +- Raw file content + +## Adding a New Fetcher + +1. Create `crates/fetchkit/src/fetchers/{name}.rs` +2. Implement `Fetcher` trait (see the sketch below) +3. Add `mod {name};` and `pub use {name}::*;` to `mod.rs` +4. Register in `FetcherRegistry::with_defaults()` (before DefaultFetcher) +5. Add test cases to `examples/fetch_urls.rs`
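+
+The skeleton below is a minimal sketch of steps 2-4. `ExampleFetcher`, the
+`docs.example.com` host, and the canned response body are placeholders for
+illustration, not part of the codebase:
+
+```rust
+use async_trait::async_trait;
+use url::Url;
+
+use crate::client::FetchOptions;
+use crate::error::FetchError;
+use crate::fetchers::{DefaultFetcher, Fetcher, FetcherRegistry};
+use crate::types::{FetchRequest, FetchResponse};
+
+/// Hypothetical fetcher used only to illustrate the trait.
+pub struct ExampleFetcher;
+
+#[async_trait]
+impl Fetcher for ExampleFetcher {
+    fn name(&self) -> &'static str {
+        "example"
+    }
+
+    fn matches(&self, url: &Url) -> bool {
+        // A real fetcher would match its own URL pattern here.
+        url.host_str() == Some("docs.example.com")
+    }
+
+    async fn fetch(
+        &self,
+        request: &FetchRequest,
+        _options: &FetchOptions,
+    ) -> Result<FetchResponse, FetchError> {
+        // Custom fetch logic goes here; this stub returns a canned response.
+        Ok(FetchResponse {
+            url: request.url.clone(),
+            status_code: 200,
+            format: Some("raw".to_string()),
+            content: Some("hello from ExampleFetcher".to_string()),
+            ..Default::default()
+        })
+    }
+}
+
+/// Build a registry that checks ExampleFetcher before the catch-all default.
+fn registry_with_example() -> FetcherRegistry {
+    let mut registry = FetcherRegistry::new();
+    registry.register(Box::new(ExampleFetcher));
+    registry.register(Box::new(DefaultFetcher::new()));
+    registry
+}
+```
+
+Registration order is priority order, so the catch-all `DefaultFetcher` must
+always be registered last, exactly as `with_defaults()` does.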