Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 142 additions & 77 deletions src/spiders/robots.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::collections::HashMap;
use std::time::Duration;
use url::Url;

pub struct RobotsTxtManager {
Expand All @@ -11,6 +12,14 @@ struct RobotsRules {
crawl_delay: Option<f64>,
}

/// One contiguous `User-agent:` group from a robots.txt file: a set of
/// applicable agents followed by their directives.
struct RobotsGroup {
/// Agent tokens named by the group's `User-agent:` lines, stored trimmed
/// and lowercased; the wildcard group contains the literal `"*"`.
agents: Vec<String>,
/// Non-empty, trimmed values of the group's `Disallow:` lines, in file order.
disallow: Vec<String>,
/// Value of the group's `Crawl-delay:` line when it parses as a float
/// (presumably seconds — confirm against the consumer of this field).
crawl_delay: Option<f64>,
}

impl RobotsTxtManager {
pub fn new(user_agent: &str) -> Self {
Self {
Expand All @@ -19,34 +28,22 @@ impl RobotsTxtManager {
}
}

/// Fetch and parse robots.txt for a domain.
/// Fetch and parse robots.txt for a domain. Uses a purpose-built HTTP
/// client with a short timeout so a hanging endpoint cannot block the
/// crawl setup indefinitely.
pub async fn fetch_robots(&mut self, domain: &str) {
let url = format!("https://{}/robots.txt", domain);
let rules = match reqwest::get(&url).await {
Ok(resp) => {
if resp.status().is_success() {
match resp.text().await {
Ok(text) => Self::parse_robots(&text, &self.user_agent),
Err(_) => RobotsRules {
disallowed: Vec::new(),
crawl_delay: None,
},
}
} else {
// Non-success status: allow all
RobotsRules {
disallowed: Vec::new(),
crawl_delay: None,
}
}
}
Err(_) => {
// On error, allow all
RobotsRules {
disallowed: Vec::new(),
crawl_delay: None,
}
}
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.build()
.unwrap_or_else(|_| reqwest::Client::new());

let rules = match client.get(&url).send().await {
Ok(resp) if resp.status().is_success() => match resp.text().await {
Ok(text) => Self::parse_robots(&text, &self.user_agent),
Err(_) => RobotsRules::allow_all(),
},
_ => RobotsRules::allow_all(),
};
self.cache.insert(domain.to_string(), rules);
}
Expand Down Expand Up @@ -76,7 +73,7 @@ impl RobotsTxtManager {
}
true
}
None => true, // No rules cached = allow
None => true,
}
}

Expand All @@ -85,63 +82,131 @@ impl RobotsTxtManager {
}

fn parse_robots(text: &str, user_agent: &str) -> RobotsRules {
let mut disallowed = Vec::new();
let mut crawl_delay = None;
let mut in_matching_section = false;
let mut found_specific = false;
let ua_lower = user_agent.to_lowercase();
let groups = parse_groups(text);

// First pass: look for specific user-agent match
for line in text.lines() {
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
// Specific agent match wins over the wildcard group.
let specific = groups
.iter()
.find(|g| g.agents.iter().any(|a| a == &ua_lower));
let chosen = specific.or_else(|| groups.iter().find(|g| g.agents.iter().any(|a| a == "*")));

if let Some(rest) = line
.strip_prefix("User-agent:")
.or_else(|| line.strip_prefix("user-agent:"))
{
let agent = rest.trim().to_lowercase();
if agent == ua_lower || agent == "*" {
// Prefer specific match over wildcard
if agent == ua_lower {
found_specific = true;
disallowed.clear();
crawl_delay = None;
in_matching_section = true;
} else {
in_matching_section = !found_specific;
}
} else {
if !found_specific {
// Could be another agent section
}
in_matching_section = false;
match chosen {
Some(g) => RobotsRules {
disallowed: g.disallow.clone(),
crawl_delay: g.crawl_delay,
},
None => RobotsRules::allow_all(),
}
}
}

impl RobotsRules {
    /// Build the permissive rule set used whenever no robots.txt rules
    /// apply: nothing is disallowed and no crawl delay is requested.
    fn allow_all() -> Self {
        let disallowed = Vec::new();
        let crawl_delay = None;
        Self {
            disallowed,
            crawl_delay,
        }
    }
}

/// Parse robots.txt into groups. A group starts at one or more consecutive
/// `User-agent:` lines and ends when the next `User-agent:` line appears
/// after at least one directive (handling the standard multi-agent grouping
/// where several `User-agent:` lines share a single rule block).
fn parse_groups(text: &str) -> Vec<RobotsGroup> {
let mut groups: Vec<RobotsGroup> = Vec::new();
let mut current: Option<RobotsGroup> = None;
let mut saw_directive = false;

for raw in text.lines() {
let line = raw.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}

if let Some(rest) = strip_prefix_ci(line, "user-agent:") {
if saw_directive {
if let Some(g) = current.take() {
groups.push(g);
}
} else if in_matching_section {
if let Some(path) = line
.strip_prefix("Disallow:")
.or_else(|| line.strip_prefix("disallow:"))
{
let path = path.trim();
if !path.is_empty() {
disallowed.push(path.to_string());
}
} else if let Some(delay) = line
.strip_prefix("Crawl-delay:")
.or_else(|| line.strip_prefix("crawl-delay:"))
{
if let Ok(d) = delay.trim().parse::<f64>() {
crawl_delay = Some(d);
}
saw_directive = false;
}
let g = current.get_or_insert_with(|| RobotsGroup {
agents: Vec::new(),
disallow: Vec::new(),
crawl_delay: None,
});
g.agents.push(rest.trim().to_lowercase());
} else if let Some(g) = current.as_mut() {
if let Some(rest) = strip_prefix_ci(line, "disallow:") {
let path = rest.trim();
if !path.is_empty() {
g.disallow.push(path.to_string());
}
saw_directive = true;
} else if let Some(rest) = strip_prefix_ci(line, "crawl-delay:") {
if let Ok(d) = rest.trim().parse::<f64>() {
g.crawl_delay = Some(d);
}
saw_directive = true;
}
}
}
if let Some(g) = current.take() {
groups.push(g);
}
groups
}

RobotsRules {
disallowed,
crawl_delay,
}
/// Strip `prefix` from the start of `line`, comparing ASCII letters without
/// regard to case, and return the remainder of `line`.
///
/// Returns `None` when `line` is shorter than `prefix` or does not begin
/// with it. Non-ASCII bytes must match exactly.
fn strip_prefix_ci<'a>(line: &'a str, prefix: &str) -> Option<&'a str> {
    let split = prefix.len();
    // `get` yields `None` when `line` is too short or `split` falls inside a
    // multi-byte character; in neither case can the prefix match.
    let head = line.get(..split)?;
    if head.eq_ignore_ascii_case(prefix) {
        line.get(split..)
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `robots` on behalf of `agent` and return only the disallowed
    /// path list.
    fn disallowed_for(robots: &str, agent: &str) -> Vec<String> {
        RobotsTxtManager::parse_robots(robots, agent).disallowed
    }

    /// Several `User-agent:` lines in a row share the rule block below them.
    #[test]
    fn multi_agent_group_applies_to_each_agent() {
        let robots = "User-agent: MyBot\nUser-agent: Googlebot\nDisallow: /private\n";
        for agent in ["MyBot", "Googlebot"] {
            assert_eq!(disallowed_for(robots, agent), ["/private"]);
        }
    }

    /// A group naming the agent beats the `*` group even when it comes later.
    #[test]
    fn specific_agent_wins_over_wildcard() {
        let robots = "User-agent: *\nDisallow: /all\n\nUser-agent: MyBot\nDisallow: /mine\n";
        assert_eq!(disallowed_for(robots, "MyBot"), ["/mine"]);
    }

    /// Agents without a group of their own fall back to the `*` group.
    #[test]
    fn wildcard_applies_when_no_specific_match() {
        let robots = "User-agent: *\nDisallow: /all\n";
        assert_eq!(disallowed_for(robots, "OtherBot"), ["/all"]);
    }

    /// Rules addressed only to other agents do not restrict this one.
    #[test]
    fn unknown_agent_with_no_wildcard_is_allow_all() {
        let robots = "User-agent: SomeoneElse\nDisallow: /\n";
        assert!(disallowed_for(robots, "MyBot").is_empty());
    }

    /// `Crawl-delay:` values are parsed as floating-point numbers.
    #[test]
    fn crawl_delay_is_parsed() {
        let robots = "User-agent: *\nCrawl-delay: 5\n";
        let parsed = RobotsTxtManager::parse_robots(robots, "MyBot");
        assert_eq!(parsed.crawl_delay, Some(5.0));
    }
}
Loading