Liohtml · Liohtml · May 28, 2026 · May 28, 2026
diff --git a/src/spiders/robots.rs b/src/spiders/robots.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::time::Duration;
 use url::Url;
 
 pub struct RobotsTxtManager {
@@ -11,6 +12,14 @@ struct RobotsRules {
     crawl_delay: Option<f64>,
 }
 
+/// One `User-agent:` group from a robots.txt file: the lower-cased agent
+/// names it applies to, its `Disallow:` paths, and its `Crawl-delay:` if any.
+struct RobotsGroup {
+    agents: Vec<String>,
+    disallow: Vec<String>,
+    crawl_delay: Option<f64>,
+}
+
 impl RobotsTxtManager {
     pub fn new(user_agent: &str) -> Self {
         Self {
@@ -19,34 +28,22 @@ impl RobotsTxtManager {
         }
     }
 
-    /// Fetch and parse robots.txt for a domain.
+    /// Fetch `https://{domain}/robots.txt` with a 10-second request timeout,
+    /// so a hanging endpoint cannot block crawl setup indefinitely, and cache
+    /// the parsed rules under `domain`; any failure is cached as allow-all.
     pub async fn fetch_robots(&mut self, domain: &str) {
         let url = format!("https://{}/robots.txt", domain);
-        let rules = match reqwest::get(&url).await {
-            Ok(resp) => {
-                if resp.status().is_success() {
-                    match resp.text().await {
-                        Ok(text) => Self::parse_robots(&text, &self.user_agent),
-                        Err(_) => RobotsRules {
-                            disallowed: Vec::new(),
-                            crawl_delay: None,
-                        },
-                    }
-                } else {
-                    // Non-success status: allow all
-                    RobotsRules {
-                        disallowed: Vec::new(),
-                        crawl_delay: None,
-                    }
-                }
-            }
-            Err(_) => {
-                // On error, allow all
-                RobotsRules {
-                    disallowed: Vec::new(),
-                    crawl_delay: None,
-                }
-            }
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(10))
+            .build()
+            .unwrap_or_else(|_| reqwest::Client::new());
+
+        let rules = match client.get(&url).send().await {
+            Ok(resp) if resp.status().is_success() => match resp.text().await {
+                Ok(text) => Self::parse_robots(&text, &self.user_agent),
+                Err(_) => RobotsRules::allow_all(),
+            },
+            _ => RobotsRules::allow_all(),
         };
         self.cache.insert(domain.to_string(), rules);
     }
@@ -76,7 +73,7 @@ impl RobotsTxtManager {
                 }
                 true
             }
-            None => true, // No rules cached = allow
+            None => true,
         }
     }
 
@@ -85,63 +82,180 @@ impl RobotsTxtManager {
     }
 
+    /// Pick the rules in robots.txt `text` that apply to `user_agent`.
+    ///
+    /// The first group naming `user_agent` (compared case-insensitively) takes
+    /// precedence over the first `*` group; with neither present the result
+    /// allows everything.
     fn parse_robots(text: &str, user_agent: &str) -> RobotsRules {
-        let mut disallowed = Vec::new();
-        let mut crawl_delay = None;
-        let mut in_matching_section = false;
-        let mut found_specific = false;
         let ua_lower = user_agent.to_lowercase();
+        let groups = parse_groups(text);
 
-        // First pass: look for specific user-agent match
-        for line in text.lines() {
-            let line = line.trim();
-            if line.is_empty() || line.starts_with('#') {
-                continue;
-            }
+        // Specific agent match wins over the wildcard group.
+        let specific = groups
+            .iter()
+            .find(|g| g.agents.iter().any(|a| a == &ua_lower));
+        let chosen =
+            specific.or_else(|| groups.iter().find(|g| g.agents.iter().any(|a| a == "*")));
 
-            if let Some(rest) = line
-                .strip_prefix("User-agent:")
-                .or_else(|| line.strip_prefix("user-agent:"))
-            {
-                let agent = rest.trim().to_lowercase();
-                if agent == ua_lower || agent == "*" {
-                    // Prefer specific match over wildcard
-                    if agent == ua_lower {
-                        found_specific = true;
-                        disallowed.clear();
-                        crawl_delay = None;
-                        in_matching_section = true;
-                    } else {
-                        in_matching_section = !found_specific;
-                    }
-                } else {
-                    if !found_specific {
-                        // Could be another agent section
-                    }
-                    in_matching_section = false;
+        match chosen {
+            Some(g) => RobotsRules {
+                disallowed: g.disallow.clone(),
+                crawl_delay: g.crawl_delay,
+            },
+            None => RobotsRules::allow_all(),
+        }
+    }
+}
+
+impl RobotsRules {
+    /// Rules that block nothing and impose no crawl delay.
+    fn allow_all() -> Self {
+        Self {
+            disallowed: Vec::new(),
+            crawl_delay: None,
+        }
+    }
+}
+
+/// Parse robots.txt into groups. A group starts at one or more consecutive
+/// `User-agent:` lines and ends when the next `User-agent:` line appears
+/// after at least one rule line (handling the standard multi-agent grouping
+/// where several `User-agent:` lines share a single rule block).
+///
+/// `Disallow:`, `Allow:` and `Crawl-delay:` all count as rule lines that
+/// close the agent list, so a group holding only `Allow:` rules is not
+/// merged into the group that follows it. `Allow:` paths themselves are not
+/// recorded. Text from `#` to the end of a line is a comment and is dropped,
+/// and a `Crawl-delay:` that is not a finite, non-negative number is ignored.
+fn parse_groups(text: &str) -> Vec<RobotsGroup> {
+    let mut groups: Vec<RobotsGroup> = Vec::new();
+    let mut current: Option<RobotsGroup> = None;
+    let mut saw_directive = false;
+
+    for raw in text.lines() {
+        // Strip a trailing `# comment` so it cannot leak into a path or value.
+        let line = raw.split('#').next().unwrap_or("").trim();
+        if line.is_empty() {
+            continue;
+        }
+
+        if let Some(rest) = strip_prefix_ci(line, "user-agent:") {
+            if saw_directive {
+                if let Some(g) = current.take() {
+                    groups.push(g);
                 }
-            } else if in_matching_section {
-                if let Some(path) = line
-                    .strip_prefix("Disallow:")
-                    .or_else(|| line.strip_prefix("disallow:"))
-                {
-                    let path = path.trim();
-                    if !path.is_empty() {
-                        disallowed.push(path.to_string());
-                    }
-                } else if let Some(delay) = line
-                    .strip_prefix("Crawl-delay:")
-                    .or_else(|| line.strip_prefix("crawl-delay:"))
-                {
-                    if let Ok(d) = delay.trim().parse::<f64>() {
-                        crawl_delay = Some(d);
-                    }
+                saw_directive = false;
+            }
+            let g = current.get_or_insert_with(|| RobotsGroup {
+                agents: Vec::new(),
+                disallow: Vec::new(),
+                crawl_delay: None,
+            });
+            g.agents.push(rest.trim().to_lowercase());
+        } else if let Some(g) = current.as_mut() {
+            if let Some(rest) = strip_prefix_ci(line, "disallow:") {
+                let path = rest.trim();
+                if !path.is_empty() {
+                    g.disallow.push(path.to_string());
+                }
+                saw_directive = true;
+            } else if strip_prefix_ci(line, "allow:").is_some() {
+                // An `Allow:` rule also ends the run of `User-agent:` lines;
+                // otherwise this group would absorb the next group's agents.
+                saw_directive = true;
+            } else if let Some(rest) = strip_prefix_ci(line, "crawl-delay:") {
+                // `parse::<f64>` accepts "NaN", "inf" and negatives; keep only usable delays.
+                let delay = rest.trim().parse::<f64>().ok();
+                if let Some(d) = delay.filter(|d| d.is_finite() && *d >= 0.0) {
+                    g.crawl_delay = Some(d);
                 }
+                saw_directive = true;
             }
         }
+    }
+    if let Some(g) = current.take() {
+        groups.push(g);
+    }
+    groups
+}
 
-        RobotsRules {
-            disallowed,
-            crawl_delay,
-        }
+/// Strip `prefix` from the start of `line`, ignoring ASCII case.
+///
+/// Returns the remainder of `line` after the prefix, or `None` when `line`
+/// does not start with it.
+fn strip_prefix_ci<'a>(line: &'a str, prefix: &str) -> Option<&'a str> {
+    if line.len() >= prefix.len()
+        && line.as_bytes()[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
+    {
+        Some(&line[prefix.len()..])
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn multi_agent_group_applies_to_each_agent() {
+        let txt = "User-agent: MyBot\nUser-agent: Googlebot\nDisallow: /private\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert_eq!(rules.disallowed, vec!["/private".to_string()]);
+        let rules = RobotsTxtManager::parse_robots(txt, "Googlebot");
+        assert_eq!(rules.disallowed, vec!["/private".to_string()]);
+    }
+
+    #[test]
+    fn specific_agent_wins_over_wildcard() {
+        let txt = "User-agent: *\nDisallow: /all\n\nUser-agent: MyBot\nDisallow: /mine\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert_eq!(rules.disallowed, vec!["/mine".to_string()]);
+    }
+
+    #[test]
+    fn wildcard_applies_when_no_specific_match() {
+        let txt = "User-agent: *\nDisallow: /all\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "OtherBot");
+        assert_eq!(rules.disallowed, vec!["/all".to_string()]);
+    }
+
+    #[test]
+    fn unknown_agent_with_no_wildcard_is_allow_all() {
+        let txt = "User-agent: SomeoneElse\nDisallow: /\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert!(rules.disallowed.is_empty());
+    }
+
+    #[test]
+    fn allow_line_ends_the_agent_list() {
+        let txt = "User-agent: MyBot\nAllow: /\n\nUser-agent: *\nDisallow: /\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert!(rules.disallowed.is_empty());
+        let rules = RobotsTxtManager::parse_robots(txt, "OtherBot");
+        assert_eq!(rules.disallowed, vec!["/".to_string()]);
+    }
+
+    #[test]
+    fn inline_comments_are_stripped() {
+        let txt = "User-agent: * # everyone\nDisallow: /tmp # scratch\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert_eq!(rules.disallowed, vec!["/tmp".to_string()]);
+    }
+
+    #[test]
+    fn invalid_crawl_delay_is_ignored() {
+        for bad in &["-1", "NaN", "inf"] {
+            let txt = format!("User-agent: *\nCrawl-delay: {}\n", bad);
+            let rules = RobotsTxtManager::parse_robots(&txt, "MyBot");
+            assert_eq!(rules.crawl_delay, None);
+        }
+    }
+
+    #[test]
+    fn crawl_delay_is_parsed() {
+        let txt = "User-agent: *\nCrawl-delay: 5\n";
+        let rules = RobotsTxtManager::parse_robots(txt, "MyBot");
+        assert_eq!(rules.crawl_delay, Some(5.0));
     }
 }