Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .githooks/pre-push
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Git pre-push hook: runs the local CI checks (scripts/ci.sh) before every push.
#
# Enable once per clone with:
#   git config core.hooksPath .githooks
# Bypass in an emergency with:
#   git push --no-verify
set -euo pipefail

# Resolve the repository root in its own assignment so that a failing
# `git rev-parse` aborts the hook through `set -e`. When the command
# substitution sits directly inside the `exec` argument its failure is not
# checked, and the hook would go on to try "/scripts/ci.sh".
repo_root="$(git rev-parse --show-toplevel)"

# Replace the hook process with the CI script, so the script's exit status
# becomes the hook's exit status.
exec "${repo_root}/scripts/ci.sh"
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
with:
components: clippy
- name: Run clippy
run: cargo clippy -- -W clippy::all -D warnings
run: cargo clippy --all-targets -- -W clippy::all -D warnings

fmt:
name: Format
Expand Down
20 changes: 20 additions & 0 deletions scripts/ci.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Local CI: mirrors .github/workflows/ci.yml so the same checks can run
# without GitHub Actions (locally, in a git hook, or any environment).
#
# Every step runs under `set -e`, so the first failing check stops the script
# with a non-zero exit status.
set -euo pipefail

# Resolve the repository root in a separate assignment: a failure of
# `git rev-parse` inside `cd "$(...)"` is not caught by `set -e`, and `cd`
# would then receive an empty path, which the shell may accept silently and
# leave the checks running in whatever directory the script was started from.
repo_root="$(git rev-parse --show-toplevel)"
cd "${repo_root}"

# Formatting check: fails if rustfmt would change any file.
echo "==> cargo fmt -- --check"
cargo fmt -- --check

# Lint every target (lib, bins, tests, examples) and treat warnings as errors.
echo "==> cargo clippy --all-targets -- -W clippy::all -D warnings"
cargo clippy --all-targets -- -W clippy::all -D warnings
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Build, then test, announcing each step with the exact command being run.
# The banner text and the cargo invocations are the same for both steps apart
# from the subcommand, so they are driven from a single loop.
for step in build test; do
    echo "==> cargo ${step} --verbose"
    cargo "${step}" --verbose
done

# Only reached when every preceding step succeeded.
echo "All checks passed."
52 changes: 41 additions & 11 deletions src/core/attributes_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,63 @@ impl AttributesHandler {
Self { inner }
}

pub fn get(&self, key: &str) -> Option<&TextHandler> { self.inner.get(key) }
pub fn contains_key(&self, key: &str) -> bool { self.inner.contains_key(key) }
pub fn len(&self) -> usize { self.inner.len() }
pub fn is_empty(&self) -> bool { self.inner.is_empty() }
pub fn keys(&self) -> impl Iterator<Item = &str> { self.inner.keys().map(|k| k.as_str()) }
pub fn values(&self) -> impl Iterator<Item = &TextHandler> { self.inner.values() }
/// Look up the attribute value stored under `key`.
///
/// Delegates to the wrapped collection's `get`; the result is `None` when
/// that lookup finds nothing.
pub fn get(&self, key: &str) -> Option<&TextHandler> {
self.inner.get(key)
}
/// Report whether the wrapped collection has an entry for `key`.
pub fn contains_key(&self, key: &str) -> bool {
self.inner.contains_key(key)
}
/// Number of entries held by the wrapped collection.
pub fn len(&self) -> usize {
self.inner.len()
}
/// `true` when the wrapped collection reports that it holds no entries.
pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
/// Iterate over the attribute names as string slices.
///
/// Names are yielded in whatever order the wrapped collection's `keys()`
/// produces them.
pub fn keys(&self) -> impl Iterator<Item = &str> {
    let names = self.inner.keys();
    names.map(|name| name.as_str())
}
/// Iterate over the attribute values as borrowed `TextHandler`s, in the
/// order produced by the wrapped collection's `values()`.
pub fn values(&self) -> impl Iterator<Item = &TextHandler> {
self.inner.values()
}

/// Iterate over `(name, value)` pairs.
///
/// The name is exposed as a `&str` and the value as a borrowed
/// `TextHandler`; ordering follows the wrapped collection's `iter()`.
pub fn iter(&self) -> impl Iterator<Item = (&str, &TextHandler)> {
    let entries = self.inner.iter();
    entries.map(|(name, handler)| (name.as_str(), handler))
}

/// Search for attributes whose values match a keyword (exact or partial).
pub fn search_values<'a>(&'a self, keyword: &'a str, partial: bool) -> impl Iterator<Item = (&'a str, &'a TextHandler)> {
pub fn search_values<'a>(
&'a self,
keyword: &'a str,
partial: bool,
) -> impl Iterator<Item = (&'a str, &'a TextHandler)> {
self.inner.iter().filter_map(move |(k, v)| {
let matches = if partial { v.as_str().contains(keyword) } else { v.as_str() == keyword };
if matches { Some((k.as_str(), v)) } else { None }
let matches = if partial {
v.as_str().contains(keyword)
} else {
v.as_str() == keyword
};
if matches {
Some((k.as_str(), v))
} else {
None
}
})
}

/// Serialize attributes to JSON string.
pub fn json_string(&self) -> String {
let map: IndexMap<&str, &str> = self.inner.iter().map(|(k, v)| (k.as_str(), v.as_str())).collect();
let map: IndexMap<&str, &str> = self
.inner
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect();
serde_json::to_string(&map).unwrap_or_default()
}
}

impl std::ops::Index<&str> for AttributesHandler {
type Output = TextHandler;
fn index(&self, key: &str) -> &Self::Output { &self.inner[key] }
fn index(&self, key: &str) -> &Self::Output {
&self.inner[key]
}
}
6 changes: 3 additions & 3 deletions src/core/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pub mod text_handler;
pub mod text_handlers;
pub mod attributes_handler;
pub mod storage;
pub mod text_handler;
pub mod text_handlers;

pub use attributes_handler::AttributesHandler;
pub use text_handler::TextHandler;
pub use text_handlers::TextHandlers;
pub use attributes_handler::AttributesHandler;
5 changes: 2 additions & 3 deletions src/core/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ impl SqliteStorage {
) -> Result<Option<HashMap<String, serde_json::Value>>, StorageError> {
let hash = Self::get_hash(identifier);
let conn = self.conn.lock().unwrap();
let mut stmt = conn.prepare(
"SELECT element_data FROM storage WHERE url = ?1 AND identifier = ?2",
)?;
let mut stmt =
conn.prepare("SELECT element_data FROM storage WHERE url = ?1 AND identifier = ?2")?;
let result: Option<String> = stmt
.query_row(params![self.url, hash], |row| row.get(0))
.ok();
Expand Down
84 changes: 75 additions & 9 deletions src/fetchers/client.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
use crate::fetchers::config::FetcherConfig;
use crate::fetchers::proxy::ProxyRotator;
use crate::fetchers::response::Response;
use std::collections::HashMap;
use std::time::Duration;

pub struct Fetcher {
config: FetcherConfig,
client: reqwest::Client,
/// One client per rotating proxy when rotation is enabled, otherwise a
/// single client. Indexed by `rotator` when present.
clients: Vec<reqwest::Client>,
rotator: Option<ProxyRotator>,
}

#[derive(Debug, thiserror::Error)]
Expand All @@ -18,6 +22,32 @@ pub enum FetcherError {

impl Fetcher {
pub fn new(config: FetcherConfig) -> Self {
let (clients, rotator) = if config.proxy_list.is_empty() {
// No rotation: a single client honouring `proxy` and the
// per-protocol `proxies` map.
(vec![Self::build_client(&config, None)], None)
} else {
// Rotation: one client bound to each proxy, selected round-robin.
let clients = config
.proxy_list
.iter()
.map(|p| Self::build_client(&config, Some(p)))
.collect();
let rotator = ProxyRotator::new(config.proxy_list.clone());
(clients, rotator)
};

Self {
config,
clients,
rotator,
}
}

/// Build a single reqwest client. When `proxy_override` is `Some`, that
/// proxy is applied for all protocols; otherwise the config's `proxy` and
/// per-protocol `proxies` map are applied.
fn build_client(config: &FetcherConfig, proxy_override: Option<&str>) -> reqwest::Client {
let mut builder = reqwest::Client::builder()
.timeout(Duration::from_secs(config.timeout_secs))
.danger_accept_invalid_certs(!config.verify_ssl);
Expand All @@ -32,17 +62,54 @@ impl Fetcher {
}

// Configure proxy
if let Some(ref proxy_url) = config.proxy {
if let Some(proxy_url) = proxy_override {
if let Ok(proxy) = reqwest::Proxy::all(proxy_url) {
builder = builder.proxy(proxy);
}
} else {
// Apply scheme-specific proxies before any wildcard so the specific
// ones win (reqwest uses the first matching proxy). `proxies` is a
// HashMap, so iterate in a deterministic order.
if let Some(proxy_url) = config.proxies.get("http") {
if let Ok(proxy) = reqwest::Proxy::http(proxy_url) {
builder = builder.proxy(proxy);
}
}
if let Some(proxy_url) = config.proxies.get("https") {
if let Ok(proxy) = reqwest::Proxy::https(proxy_url) {
builder = builder.proxy(proxy);
}
}
let mut wildcard_keys: Vec<&String> = config
.proxies
.keys()
.filter(|k| k.as_str() != "http" && k.as_str() != "https")
.collect();
wildcard_keys.sort();
for key in wildcard_keys {
if let Ok(proxy) = reqwest::Proxy::all(&config.proxies[key]) {
builder = builder.proxy(proxy);
}
}
// Single wildcard proxy applied last as a general fallback.
if let Some(ref proxy_url) = config.proxy {
if let Ok(proxy) = reqwest::Proxy::all(proxy_url) {
builder = builder.proxy(proxy);
}
}
}

let client = builder
.build()
.expect("Failed to build reqwest client");
builder.build().expect("Failed to build reqwest client")
}

Self { config, client }
/// Pick the client for the next request attempt.
///
/// With a rotator present, its cursor is advanced and the returned index
/// selects the client, so consecutive attempts (including retries) move on to
/// the next proxy. Without a rotator the first client is always used.
///
/// NOTE(review): indexing panics if the chosen index is outside `clients`;
/// this presumably cannot happen because `new` builds the pool from the same
/// list as the rotator — confirm.
fn next_client(&self) -> &reqwest::Client {
    let slot = self
        .rotator
        .as_ref()
        .map_or(0, |rotator| rotator.next_index());
    &self.clients[slot]
}

pub async fn get(&self, url: &str) -> Result<Response, FetcherError> {
Expand Down Expand Up @@ -84,7 +151,7 @@ impl Fetcher {
let mut last_error = String::new();

for attempt in 0..=self.config.retries {
let mut req = self.client.request(method.clone(), url);
let mut req = self.next_client().request(method.clone(), url);

// Set headers
for (key, value) in &headers {
Expand Down Expand Up @@ -130,8 +197,7 @@ impl Fetcher {
Err(e) => {
last_error = e.to_string();
if attempt < self.config.retries {
tokio::time::sleep(Duration::from_secs(self.config.retry_delay_secs))
.await;
tokio::time::sleep(Duration::from_secs(self.config.retry_delay_secs)).await;
}
}
}
Expand Down
43 changes: 37 additions & 6 deletions src/fetchers/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ pub struct FetcherConfig {
pub verify_ssl: bool,
pub proxy: Option<String>,
pub proxies: HashMap<String, String>,
/// Proxy URLs to rotate through, one HTTP client is built per entry and
/// selected round-robin per request. Takes precedence over `proxy` /
/// `proxies` when non-empty.
pub proxy_list: Vec<String>,
pub headers: HashMap<String, String>,
pub stealthy_headers: bool,
pub user_agent: Option<String>,
Expand All @@ -28,6 +32,7 @@ impl Default for FetcherConfig {
verify_ssl: true,
proxy: None,
proxies: HashMap::new(),
proxy_list: Vec::new(),
headers: HashMap::new(),
stealthy_headers: true,
user_agent: None,
Expand Down Expand Up @@ -74,11 +79,10 @@ impl FetcherConfig {
.entry("accept-encoding".to_string())
.or_insert_with(|| constants::ACCEPT_ENCODING.to_string());

headers
.entry("sec-ch-ua".to_string())
.or_insert_with(|| {
"\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"".to_string()
});
headers.entry("sec-ch-ua".to_string()).or_insert_with(|| {
"\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\""
.to_string()
});

headers
.entry("sec-ch-ua-mobile".to_string())
Expand Down Expand Up @@ -147,9 +151,36 @@ impl FetcherConfigBuilder {
self
}

/// Register a proxy for one protocol key.
///
/// `scheme` is lowercased before being stored, and an existing entry under
/// the same lowercased key is replaced. The `"http"` and `"https"` keys are
/// routed to their respective protocols; any other key (e.g. `"all"`) applies
/// to all protocols.
///
/// Returns the builder so calls can be chained.
pub fn protocol_proxy(
    mut self,
    scheme: impl Into<String>,
    proxy_url: impl Into<String>,
) -> Self {
    let key = scheme.into().to_lowercase();
    let target = proxy_url.into();
    self.inner.proxies.insert(key, target);
    self
}

/// Replace the rotation list with `proxies`.
///
/// Every item is converted into an owned `String` and the previous list is
/// discarded. When the resulting list is non-empty, requests are distributed
/// round-robin across one HTTP client per proxy.
///
/// Returns the builder so calls can be chained.
pub fn rotating_proxies<I, S>(mut self, proxies: I) -> Self
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let mut rotation: Vec<String> = Vec::new();
    rotation.extend(proxies.into_iter().map(|entry| entry.into()));
    self.inner.proxy_list = rotation;
    self
}

/// Add a per-header override. Key is lowercased automatically.
pub fn header(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.inner.headers.insert(key.into().to_lowercase(), value.into());
self.inner
.headers
.insert(key.into().to_lowercase(), value.into());
self
}

Expand Down
12 changes: 10 additions & 2 deletions src/fetchers/constants.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
pub const BLOCKED_RESOURCE_TYPES: &[&str] = &[
"font", "image", "media", "beacon", "object", "imageset",
"texttrack", "websocket", "csp_report", "stylesheet",
"font",
"image",
"media",
"beacon",
"object",
"imageset",
"texttrack",
"websocket",
"csp_report",
"stylesheet",
];

pub const USER_AGENTS: &[&str] = &[
Expand Down
4 changes: 2 additions & 2 deletions src/fetchers/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pub mod client;
pub mod config;
pub mod constants;
pub mod client;
pub mod response;
pub mod proxy;
pub mod response;
10 changes: 8 additions & 2 deletions src/fetchers/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,14 @@ impl ProxyRotator {

/// Return the next proxy in round-robin order.
pub fn next(&self) -> &str {
let idx = self.cursor.fetch_add(1, Ordering::Relaxed) % self.proxies.len();
&self.proxies[idx]
&self.proxies[self.next_index()]
}

/// Advance the cursor by one and return the position of the next proxy in
/// round-robin order.
///
/// The returned index is the cursor value before the increment, reduced
/// modulo the number of proxies. It can be used to index a parallel
/// collection (e.g. a pool of pre-built HTTP clients) that shares the
/// rotator's ordering.
///
/// # Panics
///
/// Panics with a division by zero if the rotator holds no proxies.
/// NOTE(review): presumably the constructor rejects an empty list — confirm.
pub fn next_index(&self) -> usize {
    let ticket = self.cursor.fetch_add(1, Ordering::Relaxed);
    ticket % self.proxies.len()
}

/// Return a pseudo-random proxy based on the current cursor position.
Expand Down
Loading
Loading