From 55abacab58113fd51ffd7ff246d3956a70217dcd Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 28 May 2026 21:58:42 +0000
Subject: [PATCH] fix: harden engine against path traversal and allowed_domains
 bypass

- Sanitize spider.name() before interpolating it into cache/checkpoint paths
  so a name like "../etc/passwd" cannot escape the .scrapling/ directory
- Reject requests whose domain cannot be parsed (data:, file://, malformed
  URLs) when allowed_domains is set, instead of silently allowing them

Closes #1
Closes #2

https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn
---
 src/spiders/engine.rs | 65 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 5 deletions(-)
diff --git a/src/spiders/engine.rs b/src/spiders/engine.rs
index 6fbab41..7000e5b 100644
--- a/src/spiders/engine.rs
+++ b/src/spiders/engine.rs
@@ -15,6 +15,27 @@ use crate::spiders::scheduler::Scheduler;
 use crate::spiders::session::SessionManager;
 use crate::spiders::spider::Spider;
 
+/// Sanitize a string for safe use as a single filesystem path segment.
+/// Replaces any character that is not alphanumeric, `_`, or `-` with `_` so
+/// values like `"../etc"` cannot escape the intended directory.
+fn sanitize_path_segment(name: &str) -> String {
+    let cleaned: String = name
+        .chars()
+        .map(|c| {
+            if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
+                c
+            } else {
+                '_'
+            }
+        })
+        .collect();
+    if cleaned.is_empty() {
+        "_".to_string()
+    } else {
+        cleaned
+    }
+}
+
 pub struct CrawlerEngine<S: Spider> {
     spider: Arc<S>,
     session_manager: Arc<SessionManager>,
@@ -48,7 +69,9 @@ impl<S: Spider> CrawlerEngine<S> {
         let cache = if spider.development_mode() {
             let dir = crawl_dir
                 .map(|d| format!("{}/cache", d))
-                .unwrap_or_else(|| format!(".scrapling/{}/cache", spider.name()));
+                .unwrap_or_else(|| {
+                    format!(".scrapling/{}/cache", sanitize_path_segment(spider.name()))
+                });
             ResponseCache::new(&dir).ok().map(Arc::new)
         } else {
             None
@@ -57,7 +80,12 @@ impl<S: Spider> CrawlerEngine<S> {
         let checkpoint = {
             let dir = crawl_dir
                 .map(|d| format!("{}/checkpoints", d))
-                .unwrap_or_else(|| format!(".scrapling/{}/checkpoints", spider.name()));
+                .unwrap_or_else(|| {
+                    format!(
+                        ".scrapling/{}/checkpoints",
+                        sanitize_path_segment(spider.name())
+                    )
+                });
             CheckpointManager::new(&dir).ok().map(Arc::new)
         };
 
@@ -270,11 +298,14 @@ impl<S: Spider> CrawlerEngine<S> {
             }
         }
 
-        // Check allowed_domains
+        // Check allowed_domains. Reject requests whose domain cannot be
+        // parsed (e.g. `data:`, `file://`, malformed URLs) so the whitelist
+        // cannot be bypassed by non-HTTP schemes.
         let allowed = spider.allowed_domains();
         if !allowed.is_empty() {
-            if let Some(domain) = request.domain() {
-                if !allowed.contains(&domain) {
+            match request.domain() {
+                Some(domain) if allowed.contains(&domain) => {}
+                _ => {
                     stats.lock().await.offsite_requests_count += 1;
                     return;
                 }
@@ -398,3 +429,27 @@ impl<S: Spider> CrawlerEngine<S> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanitize_path_segment_replaces_traversal_chars() {
+        assert_eq!(sanitize_path_segment("../etc/passwd"), "___etc_passwd");
+        assert_eq!(sanitize_path_segment("..\\windows"), "___windows");
+        assert_eq!(sanitize_path_segment("spider.name"), "spider_name");
+    }
+
+    #[test]
+    fn sanitize_path_segment_keeps_safe_chars() {
+        assert_eq!(sanitize_path_segment("my-spider_1"), "my-spider_1");
+        assert_eq!(sanitize_path_segment("Spider123"), "Spider123");
+    }
+
+    #[test]
+    fn sanitize_path_segment_handles_empty() {
+        assert_eq!(sanitize_path_segment(""), "_");
+        assert_eq!(sanitize_path_segment("///"), "___");
+    }
+}