-
Notifications
You must be signed in to change notification settings - Fork 4
Metadata providers #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: folder-linkers
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,3 @@ | ||
| [workspace] | ||
| resolver = "3" | ||
| members = ["server", "mlm_db", "mlm_parse", "mlm_mam"] | ||
|
|
||
| members = ["server", "mlm_db", "mlm_parse", "mlm_mam", "mlm_meta"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| [package] | ||
| name = "mlm_meta" | ||
| version = "0.1.0" | ||
| edition = "2024" | ||
|
|
||
| [dependencies] | ||
| anyhow = "1.0" | ||
| async-trait = "0.1" | ||
| serde = { version = "1.0", features = ["derive"] } | ||
| reqwest = { version = "0.12.20", default-features = false, features = ["json", "rustls-tls", "gzip"] } | ||
| tokio = { version = "1", features = ["rt-multi-thread", "sync", "macros"] } | ||
| serde_json = "1.0" | ||
| scraper = "0.14" | ||
| mlm_db = { path = "../mlm_db" } | ||
| mlm_parse = { path = "../mlm_parse" } | ||
| strsim = "0.11" | ||
| tracing = "0.1" | ||
|
|
||
| urlencoding = "2.1" | ||
| url = "2.4" | ||
|
|
||
| [dev-dependencies] | ||
| httpmock = "0.7" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| mlm_meta | ||
| ======== | ||
|
|
||
| Small crate defining the Provider trait and helper types for external | ||
| metadata providers (Hardcover, OpenLibrary, RomanceIo, ...). | ||
|
|
||
| Purpose | ||
| - Provide a stable trait so server can query multiple providers and map | ||
| results into existing `TorrentMeta`. | ||
|
|
||
| How to add a provider | ||
| - Implement `mlm_meta::Provider` and return `TorrentMeta` from `fetch`. | ||
| - Register the provider in server's `MetadataService` and map fields into | ||
| `TorrentMeta` before persisting. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,169 @@ | ||
| use mlm_parse::{clean_name, normalize_title}; | ||
|
|
||
| pub use anyhow; | ||
| pub use tracing::{Level, debug, enabled, trace}; | ||
|
|
||
/// A metadata search request: a title plus an optional author.
///
/// How the two fields are used is left to each provider.
#[derive(Debug, Clone)]
pub struct SearchQuery {
    pub title: String,
    pub author: Option<String>,
}

impl SearchQuery {
    /// Creates a query from an owned title and an optional owned author.
    pub fn new(title: String, author: Option<String>) -> Self {
        Self { title, author }
    }

    /// Flattens the query into one string for providers that accept a single
    /// free-text search term.
    ///
    /// Returns `"<title> <author>"` when both are non-empty, the title alone
    /// when the author is missing or empty, and an empty string whenever the
    /// title is empty (an author on its own is never emitted).
    pub fn to_combined_string(&self) -> String {
        // Without a title there is nothing to search for, author or not.
        if self.title.is_empty() {
            return String::new();
        }

        match self.author.as_deref() {
            Some(name) if !name.is_empty() => format!("{} {}", self.title, name),
            _ => self.title.clone(),
        }
    }
}
|
|
||
| /// Build SearchQuery with author included | ||
| pub fn query_with_author(title: &str, authors: &[String]) -> SearchQuery { | ||
| let author = authors | ||
| .iter() | ||
| .map(|a| a.trim()) | ||
| .find(|a| !a.is_empty()) | ||
| .map(|a| a.to_string()); | ||
| SearchQuery::new(title.to_string(), author) | ||
| } | ||
|
|
||
| /// Build SearchQuery without author (title-only search) | ||
| pub fn query_title_only(title: &str) -> SearchQuery { | ||
| SearchQuery::new(title.to_string(), None) | ||
| } | ||
|
|
||
| /// Normalized string similarity 0.0..1.0 | ||
| pub fn token_similarity(a: &str, b: &str) -> f64 { | ||
| strsim::normalized_levenshtein(a, b) | ||
| } | ||
|
|
||
| /// Normalize author names (clean and lowercase) | ||
| pub fn normalize_authors(auths: &[String]) -> Vec<String> { | ||
| auths | ||
| .iter() | ||
| .map(|a| { | ||
| let mut s = a.clone(); | ||
| let _ = clean_name(&mut s); | ||
| s.to_lowercase() | ||
| }) | ||
| .collect() | ||
| } | ||
|
|
||
| /// Score a candidate by title and author similarity. Candidate title and | ||
| /// candidate authors are provided directly as strings (the caller extracts | ||
| /// them from JSON). The query title/authors are the original query values. | ||
| pub fn score_candidate( | ||
| cand_title: Option<&str>, | ||
| cand_auths: &[String], | ||
| q_title: &Option<String>, | ||
| q_auths: &[String], | ||
| ) -> f64 { | ||
| let q_title_norm = q_title.as_ref().map(|t| normalize_title(t)); | ||
|
|
||
| let mut title_score = 0.0f64; | ||
| if let Some(qt_norm) = q_title_norm.as_ref() | ||
| && let Some(ct) = cand_title | ||
| { | ||
| let cand = normalize_title(ct); | ||
| if cand == *qt_norm { | ||
| title_score = 1.0; | ||
| } else if cand.contains(qt_norm.as_str()) || qt_norm.contains(cand.as_str()) { | ||
| title_score = 0.9; | ||
| } else { | ||
| title_score = token_similarity(&cand, qt_norm); | ||
| } | ||
| } | ||
|
|
||
| let mut author_score = 0.0f64; | ||
| if !q_auths.is_empty() { | ||
| let q_auths_norm = normalize_authors(q_auths); | ||
| let mut best = 0.0f64; | ||
|
Comment on lines
+87
to
+90
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Avoid repeated query-author normalization inside per-candidate scoring. Line 89 recomputes normalized query authors on every 🤖 Prompt for AI Agents |
||
| for a in cand_auths { | ||
| let mut n = a.clone(); | ||
| let _ = clean_name(&mut n); | ||
| let n = n.to_lowercase(); | ||
| for qa in &q_auths_norm { | ||
| if n.contains(qa) || qa.contains(&n) { | ||
| best = best.max(1.0); | ||
| } else { | ||
| best = best.max(token_similarity(&n, qa)); | ||
| } | ||
| } | ||
| } | ||
| author_score = best; | ||
| } | ||
|
|
||
| // Require minimum author match score when query has authors. | ||
| // This prevents false positives from exact title matches with wrong authors | ||
| // (e.g., "Boss of the Year" by Nicole French matching "Boss of the Year" by T. Funny) | ||
| // and prevents loose title matches (e.g., "Book Title" matching "Book Title: A Novel") | ||
| // when the author doesn't match at all. | ||
| if !q_auths.is_empty() && author_score < 0.5 { | ||
| return 0.0; | ||
| } | ||
|
|
||
| if q_title_norm.is_some() && !q_auths.is_empty() { | ||
| 0.7 * title_score + 0.3 * author_score | ||
| } else if q_title_norm.is_some() { | ||
| title_score | ||
| } else { | ||
| author_score | ||
| } | ||
| } | ||
|
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use mlm_parse::normalize_title;

    /// Turns string literals into the owned author list `score_candidate` takes.
    fn owned(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    #[test]
    fn test_token_similarity() {
        let base = "great adventure";

        // Identical input is (numerically) a perfect match.
        assert!(token_similarity(base, "great adventure") > 0.999);
        // One extra trailing character keeps the score high.
        assert!(token_similarity(base, "great adventures") > 0.8);
        // Unrelated text scores low.
        assert!(token_similarity(base, "completely different") < 0.3);
    }

    #[test]
    fn test_score_candidate_title_pref() {
        // Title-only query: the author lists must not affect the result.
        let query_title = Some(normalize_title("The Great Adventure"));
        let query_authors = owned(&[]);

        let exact_score = score_candidate(
            Some("The Great Adventure"),
            &owned(&["Alice"]),
            &query_title,
            &query_authors,
        );
        let similar_score = score_candidate(
            Some("Great Adventure"),
            &owned(&["Bob Smith"]),
            &query_title,
            &query_authors,
        );

        assert!(
            exact_score >= similar_score,
            "expected exact title to score >= similar"
        );
    }

    #[test]
    fn test_score_candidate_author_influence() {
        let query_title = Some(normalize_title("Great Adventure"));
        let query_authors = owned(&["bob smith"]);

        // Exact title, wrong author.
        let wrong_author_score = score_candidate(
            Some("Great Adventure"),
            &owned(&["Alice"]),
            &query_title,
            &query_authors,
        );
        // Slightly misspelled title, matching author.
        let right_author_score = score_candidate(
            Some("Great Adventur"),
            &owned(&["Bob Smith"]),
            &query_title,
            &query_authors,
        );

        assert!(
            right_author_score > wrong_author_score,
            "expected candidate with matching author to score higher"
        );
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| use anyhow::Result; | ||
| use async_trait::async_trait; | ||
| use reqwest::Client; | ||
|
|
||
| #[async_trait] | ||
| pub trait HttpClient: Send + Sync { | ||
| async fn get(&self, url: &str) -> Result<String>; | ||
|
|
||
| async fn post(&self, url: &str, body: Option<&str>, headers: &[(&str, &str)]) | ||
| -> Result<String>; | ||
| } | ||
|
|
||
| pub struct ReqwestClient { | ||
| client: Client, | ||
| } | ||
|
|
||
| impl ReqwestClient { | ||
| pub fn new() -> Self { | ||
| use reqwest::header::{ | ||
| ACCEPT, ACCEPT_LANGUAGE, CONNECTION, HeaderMap, HeaderName, HeaderValue, | ||
| }; | ||
|
|
||
| let mut headers = HeaderMap::new(); | ||
| headers.insert( | ||
| ACCEPT, | ||
| HeaderValue::from_static( | ||
| "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | ||
| ), | ||
| ); | ||
| headers.insert( | ||
| ACCEPT_LANGUAGE, | ||
| HeaderValue::from_static("en,en-US;q=0.9,en-GB;q=0.8,sv;q=0.7"), | ||
| ); | ||
| headers.insert(CONNECTION, HeaderValue::from_static("keep-alive")); | ||
| headers.insert( | ||
| HeaderName::from_static("dnt"), | ||
| HeaderValue::from_static("1"), | ||
| ); | ||
| headers.insert( | ||
| HeaderName::from_static("priority"), | ||
| HeaderValue::from_static("u=0, i"), | ||
| ); | ||
| headers.insert( | ||
| HeaderName::from_static("sec-fetch-dest"), | ||
| HeaderValue::from_static("document"), | ||
| ); | ||
| headers.insert( | ||
| HeaderName::from_static("sec-fetch-mode"), | ||
| HeaderValue::from_static("navigate"), | ||
| ); | ||
| headers.insert( | ||
| HeaderName::from_static("sec-fetch-site"), | ||
| HeaderValue::from_static("none"), | ||
| ); | ||
| headers.insert( | ||
| HeaderName::from_static("sec-fetch-user"), | ||
| HeaderValue::from_static("?1"), | ||
| ); | ||
|
|
||
| Self { | ||
| client: Client::builder() | ||
| .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36") | ||
| .default_headers(headers) | ||
| .gzip(true) | ||
| .build() | ||
| .unwrap() | ||
| } | ||
|
Comment on lines
+60
to
+67
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential panic on client construction.
🛡️ Proposed fix-impl ReqwestClient {
- pub fn new() -> Self {
+impl ReqwestClient {
+ pub fn new() -> Self {
+ Self::try_new().expect("failed to build HTTP client")
+ }
+
+ pub fn try_new() -> Result<Self> {
// ... header setup ...
- Self {
- client: Client::builder()
+ Ok(Self {
+ client: Client::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36")
.default_headers(headers)
.gzip(true)
- .build()
- .unwrap()
- }
+ .build()?
+ })
}
}🤖 Prompt for AI Agents |
||
| } | ||
| } | ||
|
|
||
| impl Default for ReqwestClient { | ||
| fn default() -> Self { | ||
| Self::new() | ||
| } | ||
| } | ||
|
|
||
| #[async_trait] | ||
| impl HttpClient for ReqwestClient { | ||
| async fn get(&self, url: &str) -> Result<String> { | ||
| let res = self.client.get(url).send().await?.text().await?; | ||
| Ok(res) | ||
| } | ||
|
Comment on lines
+79
to
+82
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Consider adding error context for debugging. Adding context to HTTP errors helps identify which URL failed when debugging production issues. ♻️ Proposed improvement+use anyhow::Context;
+
async fn get(&self, url: &str) -> Result<String> {
- let res = self.client.get(url).send().await?.text().await?;
+ let res = self.client.get(url).send().await
+ .with_context(|| format!("GET request failed: {}", url))?
+ .text().await
+ .with_context(|| format!("failed to read response body from: {}", url))?;
Ok(res)
}🤖 Prompt for AI Agents |
||
|
|
||
| async fn post( | ||
| &self, | ||
| url: &str, | ||
| body: Option<&str>, | ||
| headers: &[(&str, &str)], | ||
| ) -> Result<String> { | ||
| let mut req = self.client.post(url); | ||
| for (k, v) in headers { | ||
| req = req.header(*k, *v); | ||
| } | ||
| if let Some(b) = body { | ||
| req = req.body(b.to_string()); | ||
| } | ||
| let res = req.send().await?.text().await?; | ||
| Ok(res) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| pub mod helpers; | ||
| pub mod http; | ||
| pub mod providers; | ||
| pub mod tag_category_map; | ||
| pub mod traits; | ||
|
|
||
| pub use helpers::*; | ||
| pub use http::*; | ||
| pub use providers::*; | ||
| pub use tag_category_map::*; | ||
| pub use traits::*; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update provider examples to match implemented providers.
Line 5 still cites Goodreads, but this PR wires Hardcover/OpenLibrary/RomanceIo. Keeping this aligned avoids confusion for new provider contributors.
📝 Suggested doc update
📝 Committable suggestion
🤖 Prompt for AI Agents