Skip to content

Commit d61cebc

Browse files
chaliy and claude authored
feat: add pluggable fetcher system for URL-specific handling (#9)
## What

Introduces a pluggable fetcher architecture that enables specialized content fetching based on URL patterns. The system is designed to scale to hundreds of fetchers.

## Why

Different URL types require different handling strategies. For example, GitHub repository URLs should return structured metadata + README content, not raw HTML. This architecture enables building specialized fetchers for various content sources (GitHub, npm, documentation sites, etc.) while maintaining a clean API.

## How

- **Fetcher trait**: Defines `name()`, `matches(url)`, and `fetch(request, options)` methods
- **FetcherRegistry**: Dispatches URLs to the first matching fetcher in priority order
- **DefaultFetcher**: Handles all HTTP/HTTPS URLs with HTML conversion (existing behavior)
- **GitHubRepoFetcher**: Handles `github.com/{owner}/{repo}` URLs, returns repo metadata + README

### Changes

- Add `crates/fetchkit/src/fetchers/` module with trait, registry, and built-in fetchers
- Refactor `client.rs` to delegate to FetcherRegistry
- Add `FetchError::FetcherError` variant for fetcher-specific errors
- Add `specs/fetchers.md` specification
- Add `examples/fetch_urls.rs` for testing different URL types
- Add integration tests for fetcher system
- Enable `json` feature for reqwest (GitHub API)

## Risk

- Low
- Changes are additive; existing `fetch()` API unchanged
- All 73 tests pass

### Checklist

- [x] Unit tests are passed
- [x] Integration tests added
- [x] Example-based tests added
- [x] Documentation updated (specs/fetchers.md)
- [x] Specs are up to date

---------

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 4016027 commit d61cebc

13 files changed

Lines changed: 1593 additions & 341 deletions

File tree

.github/workflows/ci.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ on:
66
pull_request:
77
branches: [main]
88

9+
permissions:
10+
contents: read
11+
912
env:
1013
CARGO_TERM_COLOR: always
1114
RUST_BACKTRACE: 1
@@ -80,3 +83,14 @@ jobs:
8083
run: cargo doc --workspace --no-deps
8184
env:
8285
RUSTDOCFLAGS: -D warnings
86+
87+
examples:
88+
name: Examples
89+
runs-on: ubuntu-latest
90+
steps:
91+
- uses: actions/checkout@v4
92+
- uses: dtolnay/rust-toolchain@stable
93+
- uses: Swatinem/rust-cache@v2
94+
- name: Run fetch_urls example
95+
run: cargo run -p fetchkit --example fetch_urls
96+
timeout-minutes: 2

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ description = "AI-friendly fetchkit tool, CLI, MCP server, and library"
1515
tokio = { version = "1", features = ["rt-multi-thread", "macros", "time", "sync"] }
1616

1717
# HTTP client
18-
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "gzip", "brotli", "deflate", "stream"] }
18+
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "gzip", "brotli", "deflate", "stream", "json"] }
1919

2020
# Serialization
2121
serde = { version = "1", features = ["derive"] }
@@ -42,3 +42,6 @@ bytes = "1"
4242

4343
# Testing
4444
wiremock = "0.6"
45+
46+
# Async traits
47+
async-trait = "0.1"

crates/fetchkit/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ tracing = { workspace = true }
1818
thiserror = { workspace = true }
1919
futures = { workspace = true }
2020
bytes = { workspace = true }
21+
async-trait = { workspace = true }
2122

2223
[dev-dependencies]
2324
wiremock = { workspace = true }
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
//! Example: Fetch various URLs and display results
2+
//!
3+
//! Run with: cargo run -p fetchkit --example fetch_urls
4+
//!
5+
//! This example demonstrates the fetcher system with different URL types.
6+
7+
use fetchkit::{fetch, FetchRequest, FetchResponse};
8+
9+
/// One URL to fetch, plus the optional checks applied to its response.
struct TestCase {
    // --- what to fetch and how to label it in the output ---
    /// URL handed to the fetch request.
    url: &'static str,
    /// Human-readable label printed before the case runs.
    description: &'static str,

    // --- optional expectations; `None` skips the corresponding check ---
    /// Value the response's `format` must equal.
    expect_format: Option<&'static str>,
    /// Substring the response's `content` must contain.
    expect_contains: Option<&'static str>,
}
16+
17+
/// Define test cases here
18+
const TEST_CASES: &[TestCase] = &[
19+
TestCase {
20+
url: "https://example.com",
21+
description: "Simple HTML page",
22+
expect_format: Some("markdown"),
23+
expect_contains: Some("Example Domain"),
24+
},
25+
TestCase {
26+
url: "https://httpbin.org/json",
27+
description: "JSON endpoint",
28+
expect_format: Some("raw"),
29+
expect_contains: Some("slideshow"),
30+
},
31+
TestCase {
32+
url: "https://httpbin.org/html",
33+
description: "HTML endpoint",
34+
expect_format: Some("markdown"),
35+
expect_contains: Some("Herman Melville"),
36+
},
37+
TestCase {
38+
url: "https://github.com/rust-lang/rust",
39+
description: "GitHub repository (uses GitHubRepoFetcher)",
40+
expect_format: Some("github_repo"),
41+
expect_contains: Some("rust-lang/rust"),
42+
},
43+
TestCase {
44+
url: "https://raw.githubusercontent.com/rust-lang/rust/master/README.md",
45+
description: "Raw markdown file",
46+
expect_format: Some("raw"),
47+
expect_contains: Some("Rust"),
48+
},
49+
];
50+
51+
#[tokio::main]
52+
async fn main() {
53+
println!("FetchKit URL Examples");
54+
println!("=====================\n");
55+
56+
let mut passed = 0;
57+
let mut failed = 0;
58+
59+
for (i, case) in TEST_CASES.iter().enumerate() {
60+
println!("{}. {}", i + 1, case.description);
61+
println!(" URL: {}", case.url);
62+
63+
let request = FetchRequest::new(case.url).as_markdown();
64+
65+
match fetch(request).await {
66+
Ok(response) => {
67+
let check_result = check_expectations(case, &response);
68+
print_response_summary(&response);
69+
70+
if check_result {
71+
println!(" ✓ PASS\n");
72+
passed += 1;
73+
} else {
74+
println!(" ✗ FAIL (expectations not met)\n");
75+
failed += 1;
76+
}
77+
}
78+
Err(e) => {
79+
println!(" Error: {}", e);
80+
println!(" ✗ FAIL\n");
81+
failed += 1;
82+
}
83+
}
84+
}
85+
86+
println!("=====================");
87+
println!("Results: {} passed, {} failed", passed, failed);
88+
89+
if failed > 0 {
90+
std::process::exit(1);
91+
}
92+
}
93+
94+
fn print_response_summary(response: &FetchResponse) {
95+
println!(" Status: {}", response.status_code);
96+
97+
if let Some(ref format) = response.format {
98+
println!(" Format: {}", format);
99+
}
100+
101+
if let Some(ref ct) = response.content_type {
102+
println!(" Content-Type: {}", ct);
103+
}
104+
105+
if let Some(size) = response.size {
106+
println!(" Size: {} bytes", size);
107+
}
108+
109+
if let Some(ref content) = response.content {
110+
let preview = content.chars().take(100).collect::<String>();
111+
let preview = preview.replace('\n', " ");
112+
println!(
113+
" Preview: {}{}",
114+
preview,
115+
if content.len() > 100 { "..." } else { "" }
116+
);
117+
}
118+
119+
if let Some(ref error) = response.error {
120+
println!(" Error: {}", error);
121+
}
122+
}
123+
124+
/// Returns `true` when `response` satisfies every expectation set on `case`.
///
/// The format is checked first, then the content substring. The first
/// expectation that is not met is reported on stdout and ends the check with
/// `false`. Expectations left as `None` are skipped.
fn check_expectations(case: &TestCase, response: &FetchResponse) -> bool {
    // Format expectation: the response format must equal the expected one.
    match case.expect_format {
        Some(wanted) if response.format.as_deref() != Some(wanted) => {
            println!(
                " Expected format '{}', got '{:?}'",
                wanted, response.format
            );
            return false;
        }
        _ => {}
    }

    // Content expectation: missing content is treated as an empty string.
    match case.expect_contains {
        Some(needle) => {
            let body = match response.content.as_deref() {
                Some(text) => text,
                None => "",
            };
            if body.contains(needle) {
                true
            } else {
                println!(" Expected content to contain '{}'", needle);
                false
            }
        }
        None => true,
    }
}

0 commit comments

Comments
 (0)