Skip to content

Commit 08d3153

Browse files
committed
docs: update fetcher spec and add integration tests
- Update specs/fetchers.md with complete API documentation
- Add section on how to create new fetchers
- Add integration tests for FetcherRegistry
- Test URL validation, allow/block lists, conversion options
1 parent a299998 commit 08d3153

2 files changed

Lines changed: 217 additions & 16 deletions

File tree

crates/fetchkit/tests/integration.rs

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
//! Integration tests for FetchKit using wiremock
22
3-
use fetchkit::{fetch, FetchRequest, HttpMethod, Tool};
3+
use fetchkit::{
4+
fetch, fetch_with_options, FetchOptions, FetchRequest, FetcherRegistry, HttpMethod, Tool,
5+
};
46
use wiremock::matchers::{method, path};
57
use wiremock::{Mock, MockServer, ResponseTemplate};
68

@@ -442,3 +444,158 @@ async fn test_excessive_newlines_filtered() {
442444
// Should have at most 2 consecutive newlines
443445
assert!(!resp.content.unwrap().contains("\n\n\n"));
444446
}
447+
448+
// ============================================================================
449+
// Fetcher System Integration Tests
450+
// ============================================================================
451+
452+
#[tokio::test]
453+
async fn test_fetcher_registry_with_defaults() {
454+
let mock_server = MockServer::start().await;
455+
456+
Mock::given(method("GET"))
457+
.and(path("/page"))
458+
.respond_with(
459+
ResponseTemplate::new(200)
460+
.set_body_string("<html><body><h1>Test</h1></body></html>")
461+
.insert_header("content-type", "text/html"),
462+
)
463+
.mount(&mock_server)
464+
.await;
465+
466+
let registry = FetcherRegistry::with_defaults();
467+
let options = FetchOptions {
468+
enable_markdown: true,
469+
enable_text: true,
470+
..Default::default()
471+
};
472+
473+
let req = FetchRequest::new(format!("{}/page", mock_server.uri())).as_markdown();
474+
let resp = registry.fetch(req, options).await.unwrap();
475+
476+
assert_eq!(resp.status_code, 200);
477+
assert_eq!(resp.format, Some("markdown".to_string()));
478+
assert!(resp.content.unwrap().contains("# Test"));
479+
}
480+
481+
#[tokio::test]
482+
async fn test_fetcher_registry_url_validation() {
483+
let registry = FetcherRegistry::with_defaults();
484+
let options = FetchOptions::default();
485+
486+
// Invalid scheme
487+
let req = FetchRequest::new("ftp://example.com");
488+
let result = registry.fetch(req, options.clone()).await;
489+
assert!(result.is_err());
490+
assert!(result.unwrap_err().to_string().contains("http://"));
491+
492+
// Empty URL handled by fetch_with_options before registry
493+
let req = FetchRequest::new("");
494+
let result = fetch_with_options(req, options).await;
495+
assert!(result.is_err());
496+
}
497+
498+
#[tokio::test]
499+
async fn test_fetcher_registry_allow_block_lists() {
500+
let mock_server = MockServer::start().await;
501+
502+
Mock::given(method("GET"))
503+
.and(path("/"))
504+
.respond_with(ResponseTemplate::new(200).set_body_string("OK"))
505+
.mount(&mock_server)
506+
.await;
507+
508+
let registry = FetcherRegistry::with_defaults();
509+
510+
// Block list
511+
let options = FetchOptions {
512+
block_prefixes: vec!["http://127.0.0.1".to_string()],
513+
..Default::default()
514+
};
515+
let req = FetchRequest::new(format!("{}/", mock_server.uri()));
516+
let result = registry.fetch(req, options).await;
517+
assert!(result.is_err());
518+
assert!(result.unwrap_err().to_string().contains("Blocked"));
519+
520+
// Allow list (not matching)
521+
let options = FetchOptions {
522+
allow_prefixes: vec!["https://allowed.com".to_string()],
523+
..Default::default()
524+
};
525+
let req = FetchRequest::new(format!("{}/", mock_server.uri()));
526+
let result = registry.fetch(req, options).await;
527+
assert!(result.is_err());
528+
}
529+
530+
#[tokio::test]
531+
async fn test_github_fetcher_url_matching() {
532+
// A GitHub-style path on a non-GitHub host should NOT match GitHubRepoFetcher (will use DefaultFetcher)
533+
let mock_server = MockServer::start().await;
534+
535+
// Mock for non-GitHub URLs
536+
Mock::given(method("GET"))
537+
.and(path("/owner/repo/issues"))
538+
.respond_with(
539+
ResponseTemplate::new(200)
540+
.set_body_string("issues page")
541+
.insert_header("content-type", "text/plain"),
542+
)
543+
.mount(&mock_server)
544+
.await;
545+
546+
let req = FetchRequest::new(format!("{}/owner/repo/issues", mock_server.uri()));
547+
let resp = fetch(req).await.unwrap();
548+
549+
// Should use default fetcher (format is "raw", not "github_repo")
550+
assert_eq!(resp.format, Some("raw".to_string()));
551+
assert!(resp.content.unwrap().contains("issues page"));
552+
}
553+
554+
#[tokio::test]
555+
async fn test_fetch_enables_conversions_by_default() {
556+
let mock_server = MockServer::start().await;
557+
558+
Mock::given(method("GET"))
559+
.and(path("/"))
560+
.respond_with(
561+
ResponseTemplate::new(200)
562+
.set_body_string("<html><body><p>Hello</p></body></html>")
563+
.insert_header("content-type", "text/html"),
564+
)
565+
.mount(&mock_server)
566+
.await;
567+
568+
// Using fetch() with as_markdown() should work
569+
let req = FetchRequest::new(format!("{}/", mock_server.uri())).as_markdown();
570+
let resp = fetch(req).await.unwrap();
571+
572+
assert_eq!(resp.format, Some("markdown".to_string()));
573+
}
574+
575+
#[tokio::test]
576+
async fn test_fetch_with_options_respects_disabled_conversion() {
577+
let mock_server = MockServer::start().await;
578+
579+
Mock::given(method("GET"))
580+
.and(path("/"))
581+
.respond_with(
582+
ResponseTemplate::new(200)
583+
.set_body_string("<html><body><p>Hello</p></body></html>")
584+
.insert_header("content-type", "text/html"),
585+
)
586+
.mount(&mock_server)
587+
.await;
588+
589+
// Disable both markdown and text conversion
590+
let options = FetchOptions {
591+
enable_markdown: false,
592+
enable_text: false,
593+
..Default::default()
594+
};
595+
596+
let req = FetchRequest::new(format!("{}/", mock_server.uri())).as_markdown();
597+
let resp = fetch_with_options(req, options).await.unwrap();
598+
599+
// Should be raw because conversion is disabled
600+
assert_eq!(resp.format, Some("raw".to_string()));
601+
}

specs/fetchers.md

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,48 +22,65 @@ Central dispatcher that:
2222
2. Iterates fetchers, uses first matching one
2323
3. Falls back to default fetcher if none match
2424
4. Provides `register()` for adding custom fetchers
25+
5. Validates URL scheme and allow/block lists before dispatching
2526

2627
### Built-in Fetchers
2728

2829
#### DefaultFetcher (lowest priority)
2930

3031
- Matches: All HTTP/HTTPS URLs
31-
- Behavior: Current `client.rs` fetch logic
32-
- Returns: Standard `FetchResponse` with HTML conversion
32+
- Behavior: Standard HTTP fetch with HTML conversion support
33+
- Features:
34+
- GET and HEAD methods
35+
- HTML to markdown/text conversion (when enabled)
36+
- Binary content detection (returns metadata only)
37+
- Timeout handling with partial content support
38+
- Returns: Standard `FetchResponse` with format `"markdown"`, `"text"`, or `"raw"`
3339

3440
#### GitHubRepoFetcher
3541

36-
- Matches: `https://github.com/{owner}/{repo}` (exactly 2 path segments, no file paths)
42+
- Matches: `https://github.com/{owner}/{repo}` (exactly 2 path segments)
43+
- Excludes: Reserved paths (settings, explore, trending, etc.)
3744
- Behavior:
3845
1. Fetch repo metadata via GitHub API (`/repos/{owner}/{repo}`)
3946
2. Fetch README content if exists (`/repos/{owner}/{repo}/readme`)
40-
3. Combine into structured response
41-
- Returns: Markdown with repo metadata header + README content
47+
3. Decode base64 README content
48+
4. Combine into structured markdown response
49+
- Returns: Markdown with repo metadata + README content
4250
- Response format field: `"github_repo"`
51+
- Metadata includes: stars, forks, issues, language, license, topics, dates
4352

4453
### Response Extensions
4554

46-
`FetchResponse.format` gains new values:
55+
`FetchResponse.format` values:
56+
- `"markdown"` - HTML converted to markdown
57+
- `"text"` - HTML converted to plain text
58+
- `"raw"` - Original content unchanged
4759
- `"github_repo"` - GitHub repository metadata + README
4860

4961
### Configuration
5062

5163
Fetchers receive `FetchOptions` for:
52-
- User-Agent configuration
53-
- Allow/block URL lists (applied before fetcher matching)
64+
- `user_agent` - Custom User-Agent string
65+
- `allow_prefixes` - URL prefix allow list
66+
- `block_prefixes` - URL prefix block list
67+
- `enable_markdown` - Enable markdown conversion
68+
- `enable_text` - Enable text conversion
5469

5570
### Extensibility
5671

5772
Design supports hundreds of fetchers by:
5873
- Each fetcher in separate file under `fetchers/` module
59-
- Simple registration pattern
74+
- Simple registration pattern via `registry.register()`
6075
- No compile-time limit on fetcher count
76+
- Priority determined by registration order
6177

6278
### Error Handling
6379

6480
- Fetcher errors bubble up as `FetchError`
6581
- If specialized fetcher fails, does NOT fall back to default (explicit failure)
66-
- Add `FetchError::FetcherError(String)` for fetcher-specific errors
82+
- `FetchError::FetcherError(String)` for fetcher-specific errors
83+
- GitHub API errors return a response with the error field set
6784

6885
## Module Structure
6986

@@ -75,10 +92,10 @@ crates/fetchkit/src/
7592
│ └── github_repo.rs # GitHubRepoFetcher
7693
```
7794

78-
## API Changes
95+
## API
7996

8097
```rust
81-
// New trait
98+
// Fetcher trait
8299
#[async_trait]
83100
pub trait Fetcher: Send + Sync {
84101
fn name(&self) -> &'static str;
@@ -99,10 +116,37 @@ impl FetcherRegistry {
99116
pub async fn fetch(&self, request: FetchRequest, options: FetchOptions)
100117
-> Result<FetchResponse, FetchError>;
101118
}
119+
120+
// Convenience functions
121+
pub async fn fetch(req: FetchRequest) -> Result<FetchResponse, FetchError>;
122+
pub async fn fetch_with_options(req: FetchRequest, options: FetchOptions)
123+
-> Result<FetchResponse, FetchError>;
102124
```
103125

104126
## Testing
105127

106-
- Unit tests per fetcher with mocked HTTP
107-
- Integration tests for registry dispatch
108-
- GitHub fetcher tests with mocked GitHub API responses
128+
### Unit Tests
129+
- Per-fetcher tests with mocked HTTP (wiremock)
130+
- URL matching logic tests
131+
- Response parsing tests
132+
133+
### Integration Tests
134+
- Registry dispatch tests
135+
- End-to-end fetch tests with mock server
136+
137+
### Example-based Tests
138+
Run with: `cargo run -p fetchkit --example fetch_urls`
139+
140+
Tests real URLs:
141+
- Simple HTML pages (example.com)
142+
- JSON endpoints (httpbin.org)
143+
- GitHub repositories
144+
- Raw file content
145+
146+
## Adding a New Fetcher
147+
148+
1. Create `crates/fetchkit/src/fetchers/{name}.rs`
149+
2. Implement `Fetcher` trait
150+
3. Add `mod {name};` and `pub use {name}::*;` to `mod.rs`
151+
4. Register in `FetcherRegistry::with_defaults()` (before DefaultFetcher)
152+
5. Add test cases to `examples/fetch_urls.rs`

0 commit comments

Comments
 (0)