diff --git a/src/spiders/robots.rs b/src/spiders/robots.rs index 02c3f96..764c3f7 100644 --- a/src/spiders/robots.rs +++ b/src/spiders/robots.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::time::Duration; use url::Url; pub struct RobotsTxtManager { @@ -11,6 +12,14 @@ struct RobotsRules { crawl_delay: Option, } +/// One contiguous `User-agent:` group from a robots.txt file: a set of +/// applicable agents followed by their directives. +struct RobotsGroup { + agents: Vec, + disallow: Vec, + crawl_delay: Option, +} + impl RobotsTxtManager { pub fn new(user_agent: &str) -> Self { Self { @@ -19,34 +28,22 @@ impl RobotsTxtManager { } } - /// Fetch and parse robots.txt for a domain. + /// Fetch and parse robots.txt for a domain. Uses a purpose-built HTTP + /// client with a short timeout so a hanging endpoint cannot block the + /// crawl setup indefinitely. pub async fn fetch_robots(&mut self, domain: &str) { let url = format!("https://{}/robots.txt", domain); - let rules = match reqwest::get(&url).await { - Ok(resp) => { - if resp.status().is_success() { - match resp.text().await { - Ok(text) => Self::parse_robots(&text, &self.user_agent), - Err(_) => RobotsRules { - disallowed: Vec::new(), - crawl_delay: None, - }, - } - } else { - // Non-success status: allow all - RobotsRules { - disallowed: Vec::new(), - crawl_delay: None, - } - } - } - Err(_) => { - // On error, allow all - RobotsRules { - disallowed: Vec::new(), - crawl_delay: None, - } - } + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + + let rules = match client.get(&url).send().await { + Ok(resp) if resp.status().is_success() => match resp.text().await { + Ok(text) => Self::parse_robots(&text, &self.user_agent), + Err(_) => RobotsRules::allow_all(), + }, + _ => RobotsRules::allow_all(), }; self.cache.insert(domain.to_string(), rules); } @@ -76,7 +73,7 @@ impl RobotsTxtManager { } true } - None => true, // No rules cached = allow + None => true, } } @@ -85,63 +82,131 @@ impl RobotsTxtManager { } fn parse_robots(text: &str, user_agent: &str) -> RobotsRules { - let mut disallowed = Vec::new(); - let mut crawl_delay = None; - let mut in_matching_section = false; - let mut found_specific = false; let ua_lower = user_agent.to_lowercase(); + let groups = parse_groups(text); - // First pass: look for specific user-agent match - for line in text.lines() { - let line = line.trim(); - if line.is_empty() || line.starts_with('#') { - continue; - } + // Specific agent match wins over the wildcard group. + let specific = groups + .iter() + .find(|g| g.agents.iter().any(|a| a == &ua_lower)); + let chosen = specific.or_else(|| groups.iter().find(|g| g.agents.iter().any(|a| a == "*"))); - if let Some(rest) = line - .strip_prefix("User-agent:") - .or_else(|| line.strip_prefix("user-agent:")) - { - let agent = rest.trim().to_lowercase(); - if agent == ua_lower || agent == "*" { - // Prefer specific match over wildcard - if agent == ua_lower { - found_specific = true; - disallowed.clear(); - crawl_delay = None; - in_matching_section = true; - } else { - in_matching_section = !found_specific; - } - } else { - if !found_specific { - // Could be another agent section - } - in_matching_section = false; + match chosen { + Some(g) => RobotsRules { + disallowed: g.disallow.clone(), + crawl_delay: g.crawl_delay, + }, + None => RobotsRules::allow_all(), + } + } +} + +impl RobotsRules { + fn allow_all() -> Self { + Self { + disallowed: Vec::new(), + crawl_delay: None, + } + } +} + +/// Parse robots.txt into groups. A group starts at one or more consecutive +/// `User-agent:` lines and ends when the next `User-agent:` line appears +/// after at least one directive (handling the standard multi-agent grouping +/// where several `User-agent:` lines share a single rule block). +fn parse_groups(text: &str) -> Vec { + let mut groups: Vec = Vec::new(); + let mut current: Option = None; + let mut saw_directive = false; + + for raw in text.lines() { + let line = raw.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + + if let Some(rest) = strip_prefix_ci(line, "user-agent:") { + if saw_directive { + if let Some(g) = current.take() { + groups.push(g); } - } else if in_matching_section { - if let Some(path) = line - .strip_prefix("Disallow:") - .or_else(|| line.strip_prefix("disallow:")) - { - let path = path.trim(); - if !path.is_empty() { - disallowed.push(path.to_string()); - } - } else if let Some(delay) = line - .strip_prefix("Crawl-delay:") - .or_else(|| line.strip_prefix("crawl-delay:")) - { - if let Ok(d) = delay.trim().parse::() { - crawl_delay = Some(d); - } + saw_directive = false; + } + let g = current.get_or_insert_with(|| RobotsGroup { + agents: Vec::new(), + disallow: Vec::new(), + crawl_delay: None, + }); + g.agents.push(rest.trim().to_lowercase()); + } else if let Some(g) = current.as_mut() { + if let Some(rest) = strip_prefix_ci(line, "disallow:") { + let path = rest.trim(); + if !path.is_empty() { + g.disallow.push(path.to_string()); + } + saw_directive = true; + } else if let Some(rest) = strip_prefix_ci(line, "crawl-delay:") { + if let Ok(d) = rest.trim().parse::() { + g.crawl_delay = Some(d); } + saw_directive = true; } } + } + if let Some(g) = current.take() { + groups.push(g); + } + groups +} - RobotsRules { - disallowed, - crawl_delay, - } +fn strip_prefix_ci<'a>(line: &'a str, prefix: &str) -> Option<&'a str> { + if line.len() >= prefix.len() + && line.as_bytes()[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes()) + { + Some(&line[prefix.len()..]) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn multi_agent_group_applies_to_each_agent() { + let txt = "User-agent: MyBot\nUser-agent: Googlebot\nDisallow: /private\n"; + let rules = RobotsTxtManager::parse_robots(txt, "MyBot"); + assert_eq!(rules.disallowed, vec!["/private".to_string()]); + let rules = RobotsTxtManager::parse_robots(txt, "Googlebot"); + assert_eq!(rules.disallowed, vec!["/private".to_string()]); + } + + #[test] + fn specific_agent_wins_over_wildcard() { + let txt = "User-agent: *\nDisallow: /all\n\nUser-agent: MyBot\nDisallow: /mine\n"; + let rules = RobotsTxtManager::parse_robots(txt, "MyBot"); + assert_eq!(rules.disallowed, vec!["/mine".to_string()]); + } + + #[test] + fn wildcard_applies_when_no_specific_match() { + let txt = "User-agent: *\nDisallow: /all\n"; + let rules = RobotsTxtManager::parse_robots(txt, "OtherBot"); + assert_eq!(rules.disallowed, vec!["/all".to_string()]); + } + + #[test] + fn unknown_agent_with_no_wildcard_is_allow_all() { + let txt = "User-agent: SomeoneElse\nDisallow: /\n"; + let rules = RobotsTxtManager::parse_robots(txt, "MyBot"); + assert!(rules.disallowed.is_empty()); + } + + #[test] + fn crawl_delay_is_parsed() { + let txt = "User-agent: *\nCrawl-delay: 5\n"; + let rules = RobotsTxtManager::parse_robots(txt, "MyBot"); + assert_eq!(rules.crawl_delay, Some(5.0)); } }