From 55abacab58113fd51ffd7ff246d3956a70217dcd Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 21:58:42 +0000 Subject: [PATCH] fix: harden engine against path traversal and allowed_domains bypass - Sanitize spider.name() before interpolating it into cache/checkpoint paths so a name like "../etc/passwd" cannot escape the .scrapling/ directory - Reject requests whose domain cannot be parsed (data:, file://, malformed URLs) when allowed_domains is set, instead of silently allowing them Closes #1 Closes #2 https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn --- src/spiders/engine.rs | 65 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/src/spiders/engine.rs b/src/spiders/engine.rs index 6fbab41..7000e5b 100644 --- a/src/spiders/engine.rs +++ b/src/spiders/engine.rs @@ -15,6 +15,27 @@ use crate::spiders::scheduler::Scheduler; use crate::spiders::session::SessionManager; use crate::spiders::spider::Spider; +/// Sanitize a string for safe use as a single filesystem path segment. +/// Replaces any character that is not alphanumeric, `_`, or `-` with `_` so +/// values like `"../etc"` cannot escape the intended directory. +fn sanitize_path_segment(name: &str) -> String { + let cleaned: String = name + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '_' || c == '-' { + c + } else { + '_' + } + }) + .collect(); + if cleaned.is_empty() { + "_".to_string() + } else { + cleaned + } +} + pub struct CrawlerEngine { spider: Arc, session_manager: Arc, @@ -48,7 +69,9 @@ impl CrawlerEngine { let cache = if spider.development_mode() { let dir = crawl_dir .map(|d| format!("{}/cache", d)) - .unwrap_or_else(|| format!(".scrapling/{}/cache", spider.name())); + .unwrap_or_else(|| { + format!(".scrapling/{}/cache", sanitize_path_segment(spider.name())) + }); ResponseCache::new(&dir).ok().map(Arc::new) } else { None @@ -57,7 +80,12 @@ impl CrawlerEngine { let checkpoint = { let dir = crawl_dir .map(|d| format!("{}/checkpoints", d)) - .unwrap_or_else(|| format!(".scrapling/{}/checkpoints", spider.name())); + .unwrap_or_else(|| { + format!( + ".scrapling/{}/checkpoints", + sanitize_path_segment(spider.name()) + ) + }); CheckpointManager::new(&dir).ok().map(Arc::new) }; @@ -270,11 +298,14 @@ impl CrawlerEngine { } } - // Check allowed_domains + // Check allowed_domains. Reject requests whose domain cannot be + // parsed (e.g. `data:`, `file://`, malformed URLs) so the whitelist + // cannot be bypassed by non-HTTP schemes. let allowed = spider.allowed_domains(); if !allowed.is_empty() { - if let Some(domain) = request.domain() { - if !allowed.contains(&domain) { + match request.domain() { + Some(domain) if allowed.contains(&domain) => {} + _ => { stats.lock().await.offsite_requests_count += 1; return; } @@ -398,3 +429,27 @@ impl CrawlerEngine { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sanitize_path_segment_replaces_traversal_chars() { + assert_eq!(sanitize_path_segment("../etc/passwd"), "___etc_passwd"); + assert_eq!(sanitize_path_segment("..\\windows"), "___windows"); + assert_eq!(sanitize_path_segment("spider.name"), "spider_name"); + } + + #[test] + fn sanitize_path_segment_keeps_safe_chars() { + assert_eq!(sanitize_path_segment("my-spider_1"), "my-spider_1"); + assert_eq!(sanitize_path_segment("Spider123"), "Spider123"); + } + + #[test] + fn sanitize_path_segment_handles_empty() { + assert_eq!(sanitize_path_segment(""), "_"); + assert_eq!(sanitize_path_segment("///"), "___"); + } +}