|
| 1 | +package services |
| 2 | + |
| 3 | +import jakarta.inject.Inject |
| 4 | +import models.domain.publications.PublicationCandidate |
| 5 | +import play.api.libs.json.{JsArray, JsValue} |
| 6 | +import play.api.{Configuration, Logging} |
| 7 | + |
| 8 | +import scala.concurrent.ExecutionContext |
| 9 | + |
/**
 * Calculates relevance scores for publication candidates by combining four signals:
 *
 *   1. Keyword matching — title/abstract matched against domain-relevant terms
 *   2. OpenAlex concept weighting — `concepts`/`topics` entries read from the raw metadata
 *   3. Citation impact — normalized citation percentile, or a log-scaled raw count
 *   4. Journal relevance — bonus for known high-value journals
 *
 * The composite score is the weighted sum of the four component scores, clamped to [0, 1].
 * Weights are read from configuration under `publication-discovery.scoring.*` and default
 * to 0.35 (keyword), 0.25 (concept), 0.20 (citation) and 0.20 (journal).
 *
 * All text comparisons lowercase with `Locale.ROOT`, so matching does not depend on the
 * JVM default locale.
 *
 * @param configuration application configuration supplying optional weight overrides
 * @param ec            execution context; not referenced by any method in this class
 */
class RelevanceScoringService @Inject()(
  configuration: Configuration
)(implicit ec: ExecutionContext) extends Logging {

  // Weights for each scoring component (should sum to 1.0)
  private val keywordWeight: Double =
    configuration.getOptional[Double]("publication-discovery.scoring.keywordWeight").getOrElse(0.35)
  private val conceptWeight: Double =
    configuration.getOptional[Double]("publication-discovery.scoring.conceptWeight").getOrElse(0.25)
  private val citationWeight: Double =
    configuration.getOptional[Double]("publication-discovery.scoring.citationWeight").getOrElse(0.20)
  private val journalWeight: Double =
    configuration.getOptional[Double]("publication-discovery.scoring.journalWeight").getOrElse(0.20)

  // Surface misconfigured weights once at construction time. Scoring still works because
  // the composite is clamped, but scores saturate or shrink when the sum is not 1.0.
  private val weightSum: Double = keywordWeight + conceptWeight + citationWeight + journalWeight
  if (math.abs(weightSum - 1.0) > 1e-6) {
    logger.warn(
      s"Relevance scoring weights sum to $weightSum instead of 1.0; " +
        "composite scores are still clamped to [0, 1]"
    )
  }

  // High-value keywords for genomics/phylogenetics domain
  private[services] val primaryKeywords: Set[String] = Set(
    "haplogroup", "y-dna", "y-chromosome", "mtdna", "mitochondrial dna",
    "phylogenetic", "phylogeny", "ancient dna", "adna",
    "y-str", "snp", "whole genome sequencing", "population genetics",
    "human migration", "coalescent", "tmrca", "molecular clock"
  )

  private[services] val secondaryKeywords: Set[String] = Set(
    "genetic genealogy", "paternal lineage", "maternal lineage",
    "uniparental", "non-recombining", "nry",
    "demographic history", "founder effect", "genetic drift",
    "admixture", "archaeogenetics", "paleogenomics",
    "short tandem repeat", "microsatellite"
  )

  // OpenAlex concepts that indicate high relevance
  private[services] val highValueConcepts: Set[String] = Set(
    "haplogroup", "y chromosome", "mitochondrial dna", "human y-chromosome dna haplogroup",
    "phylogenetics", "ancient dna", "population genetics",
    "genetic genealogy", "molecular phylogenetics"
  )

  private[services] val mediumValueConcepts: Set[String] = Set(
    "genetics", "genomics", "human genetics", "molecular biology",
    "single-nucleotide polymorphism", "dna sequencing",
    "biological anthropology", "archaeogenetics"
  )

  // Known high-value journals for this domain
  private[services] val highValueJournals: Set[String] = Set(
    "nature", "science", "nature genetics", "nature communications",
    "molecular biology and evolution", "american journal of human genetics",
    "european journal of human genetics", "genome research",
    "human genetics", "human mutation", "genome biology",
    "plos genetics", "current biology", "cell",
    "proceedings of the national academy of sciences",
    "annals of human genetics", "genes", "forensic science international: genetics"
  ).map(lower)

  /** Lowercases with `Locale.ROOT` so results do not vary with the JVM default locale. */
  private def lower(text: String): String = text.toLowerCase(java.util.Locale.ROOT)

  /** Bounds a value to the closed interval [0, 1]. */
  private def clampToUnit(value: Double): Double = math.max(0.0, math.min(1.0, value))

  /** Weighted sum of the four component scores, clamped to [0, 1]. */
  private def combine(keyword: Double, concept: Double, citation: Double, journal: Double): Double =
    clampToUnit(
      keywordWeight * keyword +
        conceptWeight * concept +
        citationWeight * citation +
        journalWeight * journal
    )

  /**
   * Calculate the composite relevance score for a candidate.
   *
   * @param candidate the publication candidate to score
   * @return the weighted combination of all component scores, in [0, 1]
   */
  def score(candidate: PublicationCandidate): Double =
    combine(
      calculateKeywordScore(candidate),
      calculateConceptScore(candidate.rawMetadata),
      calculateCitationScore(candidate.rawMetadata),
      calculateJournalScore(candidate.journalName)
    )

  /**
   * Score a batch of candidates.
   *
   * @param candidates candidates to score
   * @return copies of the candidates, in the same order, with `relevanceScore` set
   */
  def scoreCandidates(candidates: Seq[PublicationCandidate]): Seq[PublicationCandidate] =
    candidates.map(c => c.copy(relevanceScore = Some(score(c))))

  /**
   * Keyword-based scoring: checks the title and abstract for domain-relevant terms.
   * Each distinct primary keyword found contributes 0.15 and each distinct secondary
   * keyword 0.08; the total is capped at 1.0.
   *
   * NOTE(review): matching is by substring, so short entries such as "snp", "nry" or
   * "adna" also match inside longer unrelated words. Confirm whether word-boundary
   * matching is wanted before tightening, since it would change existing scores.
   *
   * @param candidate the candidate whose title and optional abstract are inspected
   * @return keyword score in [0, 1]
   */
  private[services] def calculateKeywordScore(candidate: PublicationCandidate): Double = {
    val text = lower(s"${candidate.title} ${candidate.`abstract`.getOrElse("")}")

    val primaryHits = primaryKeywords.count(kw => text.contains(kw))
    val secondaryHits = secondaryKeywords.count(kw => text.contains(kw))

    math.min(1.0, primaryHits * 0.15 + secondaryHits * 0.08)
  }

  /**
   * OpenAlex concept-based scoring: reads the `concepts` array from the raw metadata
   * (falling back to `topics` when `concepts` is absent) and sums each entry's `score`,
   * weighted 1.0 when its `display_name` contains a high-value concept, 0.4 when it
   * contains a medium-value concept, and 0 otherwise. The sum is capped at 1.0.
   *
   * Expected entry structure: [{display_name: "...", score: 0.8, ...}, ...]
   *
   * @param rawMetadata raw OpenAlex JSON for the work, if available
   * @return concept score, 0.0 when metadata or concept entries are missing
   */
  private[services] def calculateConceptScore(rawMetadata: Option[JsValue]): Double = {
    val entries: Seq[JsValue] = rawMetadata
      .flatMap(json => (json \ "concepts").asOpt[JsArray].orElse((json \ "topics").asOpt[JsArray]))
      .map(_.value.toSeq)
      .getOrElse(Seq.empty)

    val total = entries.foldLeft(0.0) { (acc, concept) =>
      val name = lower((concept \ "display_name").asOpt[String].getOrElse(""))
      val apiScore = (concept \ "score").asOpt[Double].getOrElse(0.0)

      // High-value is checked first so a name matching both sets gets full weight.
      val multiplier =
        if (highValueConcepts.exists(hvc => name.contains(hvc))) 1.0
        else if (mediumValueConcepts.exists(mvc => name.contains(mvc))) 0.4
        else 0.0

      acc + apiScore * multiplier
    }

    math.min(1.0, total)
  }

  /**
   * Citation-based scoring using OpenAlex citation metrics.
   *
   * Uses `citation_normalized_percentile.value` when present, then
   * `cited_by_percentile_year.max` divided by 100, and finally `cited_by_count`
   * with logarithmic scaling. The result is clamped to [0, 1].
   *
   * @param rawMetadata raw OpenAlex JSON for the work, if available
   * @return citation score in [0, 1], 0.0 when no citation metric is present
   */
  private[services] def calculateCitationScore(rawMetadata: Option[JsValue]): Double =
    rawMetadata.flatMap { json =>
      val percentile = (json \ "citation_normalized_percentile" \ "value").asOpt[Double]
        .orElse((json \ "cited_by_percentile_year" \ "max").asOpt[Double].map(_ / 100.0))

      percentile.orElse {
        // Fallback: log-scaled raw count; log10(1001) / 3 ≈ 1.0, so ~1000 citations saturate.
        (json \ "cited_by_count").asOpt[Int].map { count =>
          if (count <= 0) 0.0
          else math.log10(count.toDouble + 1) / 3.0
        }
      }
    }.map(clampToUnit).getOrElse(0.0)

  /**
   * Journal-based scoring: 1.0 when the journal name contains a known high-value
   * journal, 0.3 when a journal name is present but not recognised, 0.0 when absent.
   *
   * NOTE(review): matching is by substring, so generic entries such as "science",
   * "cell", "genes" or "nature" also match any longer journal name containing them.
   * Confirm whether exact or whole-phrase matching is intended.
   *
   * @param journalName the candidate's journal name, if known
   * @return journal score: 1.0, 0.3 or 0.0
   */
  private[services] def calculateJournalScore(journalName: Option[String]): Double =
    journalName.map(lower) match {
      case Some(name) if highValueJournals.exists(j => name.contains(j)) => 1.0
      case Some(_) => 0.3 // A journal name is present, but it is not in the high-value set
      case None => 0.0
    }

  /**
   * Get a breakdown of scoring components for debugging/display.
   *
   * Each component is computed once and the composite is derived from those same
   * values, so the breakdown is internally consistent with [[score]].
   *
   * @param candidate the publication candidate to analyse
   * @return component scores, the composite score and the weights in effect
   */
  def scoreBreakdown(candidate: PublicationCandidate): ScoringBreakdown = {
    val keyword = calculateKeywordScore(candidate)
    val concept = calculateConceptScore(candidate.rawMetadata)
    val citation = calculateCitationScore(candidate.rawMetadata)
    val journal = calculateJournalScore(candidate.journalName)

    ScoringBreakdown(
      keywordScore = keyword,
      conceptScore = concept,
      citationScore = citation,
      journalScore = journal,
      compositeScore = combine(keyword, concept, citation, journal),
      keywordWeight = keywordWeight,
      conceptWeight = conceptWeight,
      citationWeight = citationWeight,
      journalWeight = journalWeight
    )
  }
}
| 197 | + |
/**
 * Component-level view of a relevance score, as produced by
 * `RelevanceScoringService.scoreBreakdown` for debugging/display.
 *
 * @param keywordScore   score from title/abstract keyword matching
 * @param conceptScore   score from OpenAlex concept/topic weighting
 * @param citationScore  score from citation metrics
 * @param journalScore   score from journal recognition
 * @param compositeScore weighted combination of the four component scores
 * @param keywordWeight  weight applied to `keywordScore`
 * @param conceptWeight  weight applied to `conceptScore`
 * @param citationWeight weight applied to `citationScore`
 * @param journalWeight  weight applied to `journalScore`
 */
final case class ScoringBreakdown(
  keywordScore: Double,
  conceptScore: Double,
  citationScore: Double,
  journalScore: Double,
  compositeScore: Double,
  keywordWeight: Double,
  conceptWeight: Double,
  citationWeight: Double,
  journalWeight: Double
)
0 commit comments