Skip to content

Commit 16c4b08

Browse files
committed
feat(services): Add RelevanceScoringService for multi-signal publication candidate scoring
- Introduced `RelevanceScoringService` to calculate relevance scores based on keyword matching, concept weighting, citation impact, and journal relevance. - Added comprehensive scoring logic with domain-specific weights and signal normalization. - Updated `PublicationDiscoveryService` to integrate `RelevanceScoringService` for candidate scoring. - Added unit tests in `RelevanceScoringServiceSpec` and updated `PublicationDiscoveryServiceSpec` to validate scoring integration and ensure accuracy.
1 parent 31c53e5 commit 16c4b08

4 files changed

Lines changed: 468 additions & 11 deletions

File tree

app/services/PublicationDiscoveryService.scala

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ class PublicationDiscoveryService @Inject()(
1414
candidateRepository: PublicationCandidateRepository,
1515
runRepository: PublicationSearchRunRepository,
1616
publicationRepository: PublicationRepository,
17-
publicationService: PublicationService, // Injected
18-
openAlexService: OpenAlexService
17+
publicationService: PublicationService,
18+
openAlexService: OpenAlexService,
19+
relevanceScoringService: RelevanceScoringService
1920
)(implicit ec: ExecutionContext) extends Logging {
2021

2122
def acceptCandidate(candidateId: Int, reviewedBy: java.util.UUID): Future[Option[models.domain.publications.Publication]] = {
@@ -85,13 +86,8 @@ class PublicationDiscoveryService @Inject()(
8586
c.doi.exists(existingDois.contains)
8687
}
8788

88-
// 3. Calculate Relevance Score (Placeholder logic)
89-
// For now, let's just use 0.5 as a base score, or maybe look at citation counts if available in rawMetadata
90-
val scoredCandidates = newCandidates.map { c =>
91-
// extract simple score from raw metadata if possible, else default
92-
val percentile = (c.rawMetadata.get \ "citation_normalized_percentile" \ "value").asOpt[Double]
93-
c.copy(relevanceScore = percentile.orElse(Some(0.5)))
94-
}
89+
// 3. Calculate Relevance Score using multi-signal scoring
90+
val scoredCandidates = relevanceScoringService.scoreCandidates(newCandidates)
9591

9692
// 4. Save Candidates
9793
candidateRepository.saveCandidates(scoredCandidates).flatMap { savedCandidates =>
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
package services
2+
3+
import jakarta.inject.Inject
4+
import models.domain.publications.PublicationCandidate
5+
import play.api.libs.json.{JsArray, JsValue}
6+
import play.api.{Configuration, Logging}
7+
8+
import scala.concurrent.ExecutionContext
9+
10+
/**
11+
* Calculates relevance scores for publication candidates using multiple signals:
12+
*
13+
* 1. Keyword matching — title/abstract matched against domain-relevant terms
14+
* 2. OpenAlex concept weighting — concepts with scores from OpenAlex taxonomy
15+
* 3. Citation impact — normalized citation percentile and raw count
16+
* 4. Journal relevance — bonus for known high-value journals
17+
*
18+
* Final score is a weighted combination normalized to [0, 1].
19+
*/
20+
class RelevanceScoringService @Inject()(
21+
configuration: Configuration
22+
)(implicit ec: ExecutionContext) extends Logging {
23+
24+
// Weights for each scoring component (should sum to 1.0)
25+
private val keywordWeight: Double = configuration.getOptional[Double]("publication-discovery.scoring.keywordWeight").getOrElse(0.35)
26+
private val conceptWeight: Double = configuration.getOptional[Double]("publication-discovery.scoring.conceptWeight").getOrElse(0.25)
27+
private val citationWeight: Double = configuration.getOptional[Double]("publication-discovery.scoring.citationWeight").getOrElse(0.20)
28+
private val journalWeight: Double = configuration.getOptional[Double]("publication-discovery.scoring.journalWeight").getOrElse(0.20)
29+
30+
// High-value keywords for genomics/phylogenetics domain
31+
private[services] val primaryKeywords: Set[String] = Set(
32+
"haplogroup", "y-dna", "y-chromosome", "mtdna", "mitochondrial dna",
33+
"phylogenetic", "phylogeny", "ancient dna", "adna",
34+
"y-str", "snp", "whole genome sequencing", "population genetics",
35+
"human migration", "coalescent", "tmrca", "molecular clock"
36+
)
37+
38+
private[services] val secondaryKeywords: Set[String] = Set(
39+
"genetic genealogy", "paternal lineage", "maternal lineage",
40+
"uniparental", "non-recombining", "nry",
41+
"demographic history", "founder effect", "genetic drift",
42+
"admixture", "archaeogenetics", "paleogenomics",
43+
"short tandem repeat", "microsatellite"
44+
)
45+
46+
// OpenAlex concepts that indicate high relevance
47+
private[services] val highValueConcepts: Set[String] = Set(
48+
"haplogroup", "y chromosome", "mitochondrial dna", "human y-chromosome dna haplogroup",
49+
"phylogenetics", "ancient dna", "population genetics",
50+
"genetic genealogy", "molecular phylogenetics"
51+
)
52+
53+
private[services] val mediumValueConcepts: Set[String] = Set(
54+
"genetics", "genomics", "human genetics", "molecular biology",
55+
"single-nucleotide polymorphism", "dna sequencing",
56+
"biological anthropology", "archaeogenetics"
57+
)
58+
59+
// Known high-value journals for this domain
60+
private[services] val highValueJournals: Set[String] = Set(
61+
"nature", "science", "nature genetics", "nature communications",
62+
"molecular biology and evolution", "american journal of human genetics",
63+
"european journal of human genetics", "genome research",
64+
"human genetics", "human mutation", "genome biology",
65+
"plos genetics", "current biology", "cell",
66+
"proceedings of the national academy of sciences",
67+
"annals of human genetics", "genes", "forensic science international: genetics"
68+
).map(_.toLowerCase)
69+
70+
/**
 * Combine the four signal scores into a single relevance value for a candidate.
 *
 * Each signal (keywords, concepts, citations, journal) is multiplied by its
 * configured weight and the weighted sum is limited to the range [0, 1].
 *
 * @param candidate the publication candidate to evaluate
 * @return the weighted composite score, bounded to [0, 1]
 */
def score(candidate: PublicationCandidate): Double = {
  val metadata = candidate.rawMetadata

  val fromKeywords = keywordWeight * calculateKeywordScore(candidate)
  val fromConcepts = conceptWeight * calculateConceptScore(metadata)
  val fromCitations = citationWeight * calculateCitationScore(metadata)
  val fromJournal = journalWeight * calculateJournalScore(candidate.journalName)

  val weightedSum = fromKeywords + fromConcepts + fromCitations + fromJournal

  // Weights are read from configuration and need not sum to 1.0, so bound the result.
  weightedSum.min(1.0).max(0.0)
}
87+
88+
/**
 * Score every candidate in a batch.
 *
 * @param candidates the candidates to evaluate
 * @return copies of the candidates, in the same order, each with
 *         `relevanceScore` set to the value computed by [[score]]
 */
def scoreCandidates(candidates: Seq[PublicationCandidate]): Seq[PublicationCandidate] =
  for (candidate <- candidates)
    yield candidate.copy(relevanceScore = Some(score(candidate)))
97+
98+
/**
 * Keyword-based scoring: look for domain-relevant terms in the candidate's
 * title and abstract (a missing abstract is treated as empty text).
 *
 * Matching is a case-insensitive substring test. Every distinct primary
 * keyword found contributes 0.15 and every distinct secondary keyword 0.08;
 * the total is capped at 1.0.
 *
 * @param candidate the candidate whose title and abstract are searched
 * @return a keyword score in [0, 1]
 */
private[services] def calculateKeywordScore(candidate: PublicationCandidate): Double = {
  val abstractText = candidate.`abstract`.getOrElse("")

  // Locale.ROOT keeps the lowercasing independent of the JVM default locale;
  // e.g. under a Turkish locale "I" would become a dotless "ı" and keywords
  // containing "i" would no longer match upper-case text.
  val searchable = s"${candidate.title} $abstractText".toLowerCase(java.util.Locale.ROOT)

  val primaryHits = primaryKeywords.count(keyword => searchable.contains(keyword))
  val secondaryHits = secondaryKeywords.count(keyword => searchable.contains(keyword))

  math.min(1.0, primaryHits * 0.15 + secondaryHits * 0.08)
}
112+
113+
/**
 * OpenAlex concept-based scoring: sum the API-assigned scores of the
 * concepts attached to a work, weighted by how relevant each concept is.
 *
 * Entries are read from the `concepts` array, or from `topics` when
 * `concepts` is missing or empty. Each entry is expected to look like
 * `{display_name: "...", score: 0.8, ...}`; a missing `display_name` or
 * `score` is treated as an empty name or 0.0 respectively.
 *
 * A concept whose name contains a high-value term counts with its full API
 * score, one that only contains a medium-value term counts at 40%, and any
 * other concept is ignored. The sum is capped at 1.0.
 *
 * @param rawMetadata the raw work JSON, if any
 * @return a concept score in [0, 1]; 0.0 when there is no metadata or no
 *         concept/topic entries
 */
private[services] def calculateConceptScore(rawMetadata: Option[JsValue]): Double = {
  // Weighted contribution of one concept/topic entry (0.0 if it is not a known concept).
  def contribution(entry: JsValue): Double = {
    // Locale.ROOT keeps matching independent of the JVM default locale.
    val name = (entry \ "display_name").asOpt[String].getOrElse("").toLowerCase(java.util.Locale.ROOT)
    val apiScore = (entry \ "score").asOpt[Double].getOrElse(0.0)

    if (highValueConcepts.exists(term => name.contains(term))) apiScore
    else if (mediumValueConcepts.exists(term => name.contains(term))) apiScore * 0.4
    else 0.0
  }

  // A present-but-empty array is treated like a missing one so that an empty
  // "concepts" field does not hide a populated "topics" field.
  def entriesAt(json: JsValue, field: String): Option[JsArray] =
    (json \ field).asOpt[JsArray].filter(_.value.nonEmpty)

  val capped = for {
    json <- rawMetadata
    entries <- entriesAt(json, "concepts").orElse(entriesAt(json, "topics"))
  } yield math.min(1.0, entries.value.iterator.map(contribution).sum)

  capped.getOrElse(0.0)
}
146+
147+
/**
 * Citation-based scoring from the citation metrics in the raw work JSON.
 *
 * The first available source wins, in this order:
 *  1. `citation_normalized_percentile.value`, used as-is (treated as already 0-1)
 *  2. `cited_by_percentile_year.max`, divided by 100
 *  3. `cited_by_count`, scaled logarithmically and capped at 1.0
 *
 * @param rawMetadata the raw work JSON, if any
 * @return the citation score; 0.0 when there is no metadata or none of the
 *         fields above is present
 */
private[services] def calculateCitationScore(rawMetadata: Option[JsValue]): Double =
  rawMetadata match {
    case None => 0.0

    case Some(json) =>
      // Declared as defs so each lookup only runs if the previous one found nothing.
      def normalizedPercentile: Option[Double] =
        (json \ "citation_normalized_percentile" \ "value").asOpt[Double]

      def yearlyPercentile: Option[Double] =
        (json \ "cited_by_percentile_year" \ "max").asOpt[Double].map(_ / 100.0)

      def logScaledCount: Option[Double] =
        (json \ "cited_by_count").asOpt[Int].map { citations =>
          // log10(1001) / 3 ≈ 1.0, so roughly 1000 citations reaches the cap
          if (citations > 0) math.min(1.0, math.log10(citations.toDouble + 1) / 3.0)
          else 0.0
        }

      normalizedPercentile
        .orElse(yearlyPercentile)
        .orElse(logScaledCount)
        .getOrElse(0.0)
  }
168+
169+
/**
 * Journal-based scoring: bonus for publications in known high-value journals.
 *
 * The journal name is trimmed and lowercased before comparison. Multi-word
 * entries of `highValueJournals` match when they occur anywhere in the name,
 * which tolerates prefixes and suffixes such as a leading "The". Single-word
 * entries (e.g. "science", "nature", "cell") must equal the whole name,
 * because as substrings they would also match unrelated journals such as
 * "Journal of Dairy Science".
 *
 * @param journalName the journal name, if known
 * @return 1.0 for a high-value journal, 0.3 for any other named journal,
 *         0.0 when the name is absent or blank
 */
private[services] def calculateJournalScore(journalName: Option[String]): Double = {
  // Locale.ROOT keeps matching independent of the JVM default locale;
  // blank names are treated the same as a missing journal.
  val normalized = journalName
    .map(_.trim.toLowerCase(java.util.Locale.ROOT))
    .filter(_.nonEmpty)

  def isHighValue(name: String): Boolean =
    highValueJournals.exists { journal =>
      if (journal.contains(' ')) name.contains(journal)
      else name == journal
    }

  normalized match {
    case Some(name) if isHighValue(name) => 1.0
    case Some(_) => 0.3 // Named journal, but not a high-value one
    case None => 0.0
  }
}
179+
180+
/**
 * Report the individual signal scores, the composite score and the weights
 * in use for a candidate, for debugging or display.
 *
 * @param candidate the publication candidate to evaluate
 * @return a [[ScoringBreakdown]] holding each component score, the composite
 *         score from [[score]], and the four configured weights
 */
def scoreBreakdown(candidate: PublicationCandidate): ScoringBreakdown = {
  val metadata = candidate.rawMetadata

  val keywords = calculateKeywordScore(candidate)
  val concepts = calculateConceptScore(metadata)
  val citations = calculateCitationScore(metadata)
  val journal = calculateJournalScore(candidate.journalName)
  val composite = score(candidate)

  ScoringBreakdown(
    keywordScore = keywords,
    conceptScore = concepts,
    citationScore = citations,
    journalScore = journal,
    compositeScore = composite,
    keywordWeight = keywordWeight,
    conceptWeight = conceptWeight,
    citationWeight = citationWeight,
    journalWeight = journalWeight
  )
}
196+
}
197+
198+
/**
 * Per-signal breakdown of a relevance score, together with the weights that
 * were applied to each signal.
 *
 * @param keywordScore   score from title/abstract keyword matching
 * @param conceptScore   score from OpenAlex concept/topic matching
 * @param citationScore  score from citation metrics
 * @param journalScore   score from the journal name
 * @param compositeScore final weighted score
 * @param keywordWeight  weight applied to `keywordScore`
 * @param conceptWeight  weight applied to `conceptScore`
 * @param citationWeight weight applied to `citationScore`
 * @param journalWeight  weight applied to `journalScore`
 */
final case class ScoringBreakdown(
  keywordScore: Double,
  conceptScore: Double,
  citationScore: Double,
  journalScore: Double,
  compositeScore: Double,
  keywordWeight: Double,
  conceptWeight: Double,
  citationWeight: Double,
  journalWeight: Double
)

test/services/PublicationDiscoveryServiceSpec.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ class PublicationDiscoveryServiceSpec extends ServiceSpec {
1818
val mockPubRepo: PublicationRepository = mock[PublicationRepository]
1919
val mockPubService: PublicationService = mock[PublicationService]
2020
val mockOpenAlexService: OpenAlexService = mock[OpenAlexService]
21+
val mockRelevanceScoringService: RelevanceScoringService = mock[RelevanceScoringService]
2122

2223
val service = new PublicationDiscoveryService(
2324
mockSearchConfigRepo, mockCandidateRepo, mockRunRepo,
24-
mockPubRepo, mockPubService, mockOpenAlexService
25+
mockPubRepo, mockPubService, mockOpenAlexService, mockRelevanceScoringService
2526
)
2627

2728
override def beforeEach(): Unit = {
2829
reset(mockSearchConfigRepo, mockCandidateRepo, mockRunRepo,
29-
mockPubRepo, mockPubService, mockOpenAlexService)
30+
mockPubRepo, mockPubService, mockOpenAlexService, mockRelevanceScoringService)
3031
}
3132

3233
val reviewerId: UUID = UUID.randomUUID()

0 commit comments

Comments
 (0)