diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py index ddfd11c..2a47288 100644 --- a/bioscancast/datasets/biosecurity_sources.py +++ b/bioscancast/datasets/biosecurity_sources.py @@ -1,34 +1,112 @@ """Known biosecurity dashboard URLs by pathogen. -v1 — flagged for iteration after first benchmark run. -The dashboard list and routing logic will need updating as new outbreaks emerge -and data portals change. +v1 — flagged for iteration after first benchmark run. The dashboard list +and routing logic will need updating as new outbreaks emerge and data +portals change. + +Each entry carries a pathogen-specific ``title`` and ``snippet`` so that +the heuristic filter and the LLM-rescue path have real signal to work +with. The earlier convention ("Dashboard: cdc.gov" with a generic +snippet) produced keyword_overlap_score = 0.000 across the board — see +issue #14 and the q7/q12 live-run findings. """ -DASHBOARD_LOOKUP: dict[str, list[str]] = { +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class DashboardEntry: + """A curated authoritative source for a pathogen. + + The title and snippet are intended to be readable as a search result + in their own right: pathogen name, the kind of data the page hosts, + and the publisher. They feed both the keyword-overlap heuristic and + the LLM-rescue path. + """ + + url: str + title: str + snippet: str + + +DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = { "h5n1": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "avian influenza": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "mpox": [ - "https://ourworldindata.org/mpox", - "https://www.who.int/emergencies/situation-reports", - "https://www.cdc.gov/mpox/data-research/index.html", + DashboardEntry( + url="https://ourworldindata.org/mpox", + title="Our World in Data mpox tracker: global confirmed cases and deaths", + snippet="OWID dashboard tracking cumulative confirmed mpox cases and deaths globally, broken down by country and region, updated from national health agencies.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/situation-reports", + title="WHO situation reports including the multi-country mpox outbreak", + snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.", + ), + DashboardEntry( + url="https://www.cdc.gov/monkeypox/situation-summary/index.html", + title="CDC mpox current situation summary: confirmed cases in the United States", + snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.", + ), ], "ebola": [ - "https://www.afro.who.int/health-topics/ebola-virus-disease", - "https://www.cdc.gov/ebola/index.html", + DashboardEntry( + url="https://www.afro.who.int/health-topics/ebola-disease", + title="WHO Africa Ebola virus disease outbreak surveillance and case counts", + snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.", + ), + DashboardEntry( + url="https://www.cdc.gov/ebola/about/index.html", + title="CDC Ebola virus disease outbreak history and case counts", + snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.", + ), ], "covid-19": [ - "https://ourworldindata.org/coronavirus", - "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + DashboardEntry( + url="https://ourworldindata.org/coronavirus", + title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations", + snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + title="WHO COVID-19 situation reports and global case counts", + snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.", + ), ], "marburg": [ - "https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", - "https://www.cdc.gov/marburg/index.html", + DashboardEntry( + url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", + title="WHO Marburg virus disease facts and outbreak case counts", + snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.", + ), + DashboardEntry( + url="https://www.cdc.gov/marburg/index.html", + title="CDC Marburg virus disease outbreaks and surveillance", + snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.", + ), ], } diff --git a/bioscancast/datasets/source_tiers.py b/bioscancast/datasets/source_tiers.py index 93a0013..8abadcc 100644 --- a/bioscancast/datasets/source_tiers.py +++ b/bioscancast/datasets/source_tiers.py @@ -59,6 +59,36 @@ "wikipedia.org", "sciencedirect.com", "pubmed.ncbi.nlm.nih.gov", + # National/international news with established newsrooms. Added after the + # #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md): + # live pools showed reputable outbreak reporting from these outlets + # resolving to "unknown" (domain_score 0.2), which sank them below the + # filter's credibility floor. Second-level-domain matching in + # resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com, + # africa.businessinsider.com, etc.). + "cnn.com", + "nbcnews.com", + "cbsnews.com", + "abcnews.go.com", + "abcnews.com", + "npr.org", + "pbs.org", + "usatoday.com", + "latimes.com", + "politico.com", + "politico.eu", + "axios.com", + "thehill.com", + "forbes.com", + "bloomberg.com", + "ft.com", + "wsj.com", + "economist.com", + "time.com", + "theatlantic.com", + "newyorker.com", + "arstechnica.com", + "businessinsider.com", } TIER_4_DOMAINS: set[str] = { diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py index f0477aa..187dee4 100644 --- a/bioscancast/filtering/config.py +++ b/bioscancast/filtering/config.py @@ -38,7 +38,14 @@ "domain": 0.20, "official_bonus": 0.20, }, - "heuristic_keep_threshold": 0.72, + # Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter + # survival of 4.7% / 13.5% — the threshold was tighter than the + # heuristic's actual signal supports. Borderline candidates that + # cross the new threshold still go to the LLM rescue path; this + # change just stops dropping high-credibility-but-low-keyword-overlap + # results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7). + # See issue #13. + "heuristic_keep_threshold": 0.65, "heuristic_borderline_threshold": 0.45, "reranker_weights": { @@ -49,6 +56,18 @@ "auto_reject_after_rerank": 0.30, "max_llm_filter_candidates": 10, + # When no LLM client is configured, the ambiguous "llm_needed" band + # (reranked priority between auto_reject and auto_keep) is normally + # rejected outright (fail-closed). With this flag enabled — for dev / + # offline / no-API-key runs — a borderline candidate is instead KEPT if it + # is an official domain OR its keyword-overlap relevance clears + # ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue + # path without an API call, recovering the on-topic / authoritative tail + # without admitting the generic-news mass. Default OFF so production (which + # always has an LLM client) is unchanged. See issue #13. + "no_llm_soft_fallback": False, + "no_llm_fallback_relevance_threshold": 0.5, + "max_docs_per_domain": 2, "max_docs_per_type": 5, diff --git a/bioscancast/filtering/heuristics.py b/bioscancast/filtering/heuristics.py index 8aa67bc..fe41dc5 100644 --- a/bioscancast/filtering/heuristics.py +++ b/bioscancast/filtering/heuristics.py @@ -121,6 +121,27 @@ def heuristic_filter( ) continue + # Dashboard-injected results are hand-curated in + # ``bioscancast/datasets/biosecurity_sources.py``; they bypass the + # keyword-overlap-driven heuristic which structurally undervalues + # their generic titles. See issue #14 and live-run data on q7/q12 + # where 4/4 injected dashboards had keyword_overlap == 0.000. + if result.retrieval_reason == "dashboard_lookup": + relevance_score = compute_heuristic_relevance(result, question) + credibility_score = compute_heuristic_credibility(result) + keep_list.append( + make_decision( + result=result, + keep=True, + stage="heuristic", + relevance_score=relevance_score, + credibility_score=credibility_score, + priority_score=1.0, + reason_codes=["dashboard_lookup_bypass"], + ) + ) + continue + relevance_score = compute_heuristic_relevance(result, question) credibility_score = compute_heuristic_credibility(result) priority_score = compute_priority_score(result, relevance_score, credibility_score) diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py index e8c2e5b..80dc0d7 100644 --- a/bioscancast/filtering/pipeline.py +++ b/bioscancast/filtering/pipeline.py @@ -37,11 +37,24 @@ def run( llm_decisions: list[FilterDecision] = [] if llm_needed: if self.llm_client is None: - # Fail closed: reject ambiguous cases if no LLM client is configured. + # No LLM client. Default is fail-closed (reject the ambiguous + # band). When the soft-fallback flag is enabled, keep candidates + # that are official-domain or sufficiently relevant — see + # FILTER_CONFIG["no_llm_soft_fallback"] and issue #13. + soft = FILTER_CONFIG.get("no_llm_soft_fallback", False) + rel_threshold = FILTER_CONFIG.get( + "no_llm_fallback_relevance_threshold", 0.5 + ) for d in llm_needed: - d.keep = False d.stage = "llm_skipped" - d.reason_codes.append("no_llm_client_configured") + result = result_map.get(d.result_id) + is_official = bool(result and result.is_official_domain) + if soft and (is_official or d.relevance_score >= rel_threshold): + d.keep = True + d.reason_codes.append("no_llm_soft_fallback_kept") + else: + d.keep = False + d.reason_codes.append("no_llm_client_configured") llm_decisions = llm_needed else: llm_decisions = llm_filter_candidates( diff --git a/bioscancast/filtering/postprocess.py b/bioscancast/filtering/postprocess.py index c0caba5..fd8d07c 100644 --- a/bioscancast/filtering/postprocess.py +++ b/bioscancast/filtering/postprocess.py @@ -49,6 +49,17 @@ def cap_per_domain_and_type( max_docs_per_domain: int, max_docs_per_type: int, ) -> List[FilteredDocument]: + """Limit how many docs from a single domain or file type survive. + + Dashboard-bypassed docs (selection_reasons contains + ``"dashboard_lookup_bypass"``) are always kept and do not consume a + slot against either cap. Curated dashboard injections are a separate + channel from organic search results; without this carve-out, a + dashboard sitting at synthetic priority 1.0 displaces a genuine + organic candidate on the same domain - which is exactly what + happened on q7 (WHO sitreps dashboard squeezed out the WHO research + event page that the baseline extracted records from). + """ kept: list[FilteredDocument] = [] domain_counts = defaultdict(int) type_counts = defaultdict(int) @@ -56,14 +67,20 @@ def cap_per_domain_and_type( for doc in docs: doc_type = doc.file_type or "unknown" - if domain_counts[doc.domain] >= max_docs_per_domain: - continue - if type_counts[doc_type] >= max_docs_per_type: - continue + is_dashboard_bypass = "dashboard_lookup_bypass" in ( + doc.selection_reasons or [] + ) + + if not is_dashboard_bypass: + if domain_counts[doc.domain] >= max_docs_per_domain: + continue + if type_counts[doc_type] >= max_docs_per_type: + continue kept.append(doc) - domain_counts[doc.domain] += 1 - type_counts[doc_type] += 1 + if not is_dashboard_bypass: + domain_counts[doc.domain] += 1 + type_counts[doc_type] += 1 return kept diff --git a/bioscancast/stages/search_stage/dashboard_lookup.py b/bioscancast/stages/search_stage/dashboard_lookup.py index e3784c3..2d3048c 100644 --- a/bioscancast/stages/search_stage/dashboard_lookup.py +++ b/bioscancast/stages/search_stage/dashboard_lookup.py @@ -31,6 +31,46 @@ logger = logging.getLogger(__name__) +# Common name variants that should route to a canonical DASHBOARD_LOOKUP key. +# The canonical-key substring fallback in ``_resolve_pathogen_key`` already +# handles suffixes like "marburg virus disease" -> "marburg"; this map covers +# synonyms where the canonical key is NOT a substring of the alias. +_PATHOGEN_ALIASES: dict[str, str] = { + "monkeypox": "mpox", + "sars-cov-2": "covid-19", + "sars-cov2": "covid-19", + "covid": "covid-19", + "covid19": "covid-19", + "coronavirus": "covid-19", + "bird flu": "h5n1", + "avian flu": "h5n1", +} + + +def _resolve_pathogen_key(pathogen: str) -> str | None: + """Map a free-text pathogen string to a DASHBOARD_LOOKUP key, tolerantly. + + Resolution order: exact key, exact alias, alias-substring, then + canonical-key substring (longest match wins, so "ebola virus disease" + resolves to "ebola" and "marburg virus disease" to "marburg"). Returns + None if nothing matches. + """ + key = pathogen.strip().lower() + if not key: + return None + if key in DASHBOARD_LOOKUP: + return key + if key in _PATHOGEN_ALIASES and _PATHOGEN_ALIASES[key] in DASHBOARD_LOOKUP: + return _PATHOGEN_ALIASES[key] + for alias, canon in _PATHOGEN_ALIASES.items(): + if alias in key and canon in DASHBOARD_LOOKUP: + return canon + matches = [k for k in DASHBOARD_LOOKUP if k in key] + if matches: + return max(matches, key=len) + return None + + def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: """Generate synthetic SearchResult entries for known pathogen dashboards. @@ -47,22 +87,22 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: if not question.pathogen: return [] - pathogen_key = question.pathogen.strip().lower() - urls = DASHBOARD_LOOKUP.get(pathogen_key, []) - if not urls: + pathogen_key = _resolve_pathogen_key(question.pathogen) + if not pathogen_key: return [] + entries = DASHBOARD_LOOKUP[pathogen_key] as_of = question.as_of_date results: list[SearchResult] = [] now = datetime.now(timezone.utc) - for url in urls: + for entry in entries: if as_of is not None: - snapshot = closest_snapshot_before(url, as_of) + snapshot = closest_snapshot_before(entry.url, as_of) if snapshot is None: logger.info( "Suppressing dashboard %s — no Wayback snapshot at-or-before %s", - url, as_of.isoformat(), + entry.url, as_of.isoformat(), ) continue snapshot_dt, snapshot_url = snapshot @@ -71,12 +111,12 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: published_date_source = "wayback_snapshot" # Keep ``domain`` as the original publisher for tier scoring; # the URL itself points at archive.org for fetching. - domain = extract_domain(url) + domain = extract_domain(entry.url) else: - effective_url = url + effective_url = entry.url published_date = None published_date_source = None - domain = extract_domain(url) + domain = extract_domain(entry.url) tier_num, domain_score, source_tier = resolve_tier(domain) @@ -89,8 +129,8 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: url=effective_url, canonical_url=normalize_url(effective_url), domain=domain, - title=f"Dashboard: {domain}", - snippet=f"Known {pathogen_key} monitoring dashboard", + title=entry.title, + snippet=entry.snippet, rank=0, retrieved_at=now, published_date=published_date, diff --git a/bioscancast/stages/search_stage/pipeline.py b/bioscancast/stages/search_stage/pipeline.py index edc573f..1477634 100644 --- a/bioscancast/stages/search_stage/pipeline.py +++ b/bioscancast/stages/search_stage/pipeline.py @@ -13,7 +13,9 @@ from typing import List, Optional from bioscancast.filtering.config import FILTER_CONFIG +from bioscancast.filtering.heuristics import build_query_terms from bioscancast.filtering.models import ForecastQuestion, SearchResult +from bioscancast.filtering.utils import keyword_overlap_score from bioscancast.llm.base import LLMClient from bioscancast.stages.search_stage.backends.base import RawSearchResult, SearchBackend from bioscancast.stages.search_stage.cache import SearchCache @@ -83,10 +85,39 @@ def _compute_freshness( return max(0.0, min(1.0, 1.0 - (days_old / 365.0))) -def _compute_search_stage_score(domain_score: float, freshness_score: float, rank: int) -> float: - """search_stage_score = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * (1/rank)""" +# search_stage_score weights (sum to 1.0). Relevance (keyword overlap of +# title/snippet/domain against the question terms) is the dominant term: +# domain/freshness/rank alone rank off-topic high-authority content too highly, +# because freshness is ~uniform in live mode and domain score is too coarse to +# separate on-topic from off-topic within a tier. Freshness is kept low for that +# reason. See data/investigations/findings-issues-3-4-13.md (#4). +_SCORE_W_RELEVANCE = 0.45 +_SCORE_W_DOMAIN = 0.30 +_SCORE_W_FRESHNESS = 0.10 +_SCORE_W_RANK = 0.15 + + +def _compute_relevance(result: SearchResult, question: ForecastQuestion) -> float: + """Keyword overlap of the result against the question terms. + + Mirrors ``bioscancast.filtering.heuristics.compute_heuristic_relevance`` so + the search stage and the filter stage use the same relevance signal. + """ + text = f"{result.title} {result.snippet} {result.domain}" + return keyword_overlap_score(text, build_query_terms(question)) + + +def _compute_search_stage_score( + relevance: float, domain_score: float, freshness_score: float, rank: int +) -> float: + """search_stage_score = 0.45*relevance + 0.30*domain + 0.10*freshness + 0.15*(1/rank)""" rank_score = 1.0 / max(rank, 1) - raw = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * rank_score + raw = ( + _SCORE_W_RELEVANCE * relevance + + _SCORE_W_DOMAIN * domain_score + + _SCORE_W_FRESHNESS * freshness_score + + _SCORE_W_RANK * rank_score + ) return max(0.0, min(1.0, raw)) @@ -260,7 +291,10 @@ def run(self, question: ForecastQuestion) -> List[SearchResult]: r.published_date, reference_date=as_of ) r.search_stage_score = _compute_search_stage_score( - r.domain_score, r.freshness_score, r.rank + _compute_relevance(r, question), + r.domain_score, + r.freshness_score, + r.rank, ) # 8. Sort and cap diff --git a/bioscancast/tests/test_dashboard_lookup.py b/bioscancast/tests/test_dashboard_lookup.py index c244e49..73163b0 100644 --- a/bioscancast/tests/test_dashboard_lookup.py +++ b/bioscancast/tests/test_dashboard_lookup.py @@ -46,6 +46,23 @@ def test_case_insensitive(self): results = lookup_dashboards(q) assert len(results) > 0 + def test_multiword_pathogen_routes_via_substring(self): + # CSV-natural "Marburg Virus Disease" -> pathogen "marburg virus disease" + # must still route to the "marburg" dashboard key. + canonical = lookup_dashboards(_make_question(pathogen="marburg")) + multiword = lookup_dashboards(_make_question(pathogen="marburg virus disease")) + assert len(multiword) > 0 + assert [r.url for r in multiword] == [r.url for r in canonical] + + def test_alias_routes_to_canonical(self): + # "monkeypox" -> "mpox"; "bird flu" -> "h5n1". + assert len(lookup_dashboards(_make_question(pathogen="monkeypox"))) > 0 + assert ( + [r.url for r in lookup_dashboards(_make_question(pathogen="monkeypox"))] + == [r.url for r in lookup_dashboards(_make_question(pathogen="mpox"))] + ) + assert len(lookup_dashboards(_make_question(pathogen="bird flu"))) > 0 + def test_results_have_required_fields(self): q = _make_question(pathogen="ebola") results = lookup_dashboards(q) diff --git a/bioscancast/tests/test_pipeline.py b/bioscancast/tests/test_pipeline.py index f65bceb..f914225 100644 --- a/bioscancast/tests/test_pipeline.py +++ b/bioscancast/tests/test_pipeline.py @@ -1,5 +1,6 @@ from datetime import datetime +from bioscancast.filtering.config import FILTER_CONFIG from bioscancast.filtering.models import ForecastQuestion, SearchResult from bioscancast.filtering.pipeline import FilteringPipeline @@ -36,4 +37,59 @@ def test_pipeline_keeps_official_result(): docs = pipeline.run(question, [result]) assert len(docs) == 1 - assert docs[0].domain == "who.int" \ No newline at end of file + assert docs[0].domain == "who.int" + + +def _borderline_question(): + return ForecastQuestion( + id="q1", + text="How many confirmed Ebola cases in the DRC outbreak?", + created_at=datetime(2026, 5, 1), + pathogen="ebola", + region="DRC", + ) + + +def _borderline_result(): + # trusted_media (domain_score 0.6, non-official) with partial term overlap → + # lands in the heuristic borderline band, then the no-LLM "llm_needed" band. + return SearchResult( + id="r-border", + question_id="q1", + query_id="sq1", + engine="google", + url="https://www.cnn.com/ebola-drc", + canonical_url="https://www.cnn.com/ebola-drc", + domain="cnn.com", + title="Ebola cases climb in the DRC outbreak", + snippet="Confirmed Ebola cases reported in the Democratic Republic of the Congo outbreak.", + rank=2, + retrieved_at=datetime(2026, 5, 1), + source_tier="trusted_media", + is_official_domain=False, + domain_score=0.6, + freshness_score=1.0, + search_stage_score=0.6, + ) + + +def test_no_llm_soft_fallback_flag_changes_borderline_outcome(): + question = _borderline_question() + result = _borderline_result() + + saved = dict(FILTER_CONFIG) + try: + # Flag OFF (default): fail closed → borderline candidate dropped. + FILTER_CONFIG["no_llm_soft_fallback"] = False + docs_off = FilteringPipeline(llm_client=None).run(question, [result]) + + # Flag ON: relevant borderline candidate kept without an LLM call. + FILTER_CONFIG["no_llm_soft_fallback"] = True + FILTER_CONFIG["no_llm_fallback_relevance_threshold"] = 0.0 + docs_on = FilteringPipeline(llm_client=None).run(question, [result]) + finally: + FILTER_CONFIG.clear() + FILTER_CONFIG.update(saved) + + assert {d.result_id for d in docs_off} == set() + assert {d.result_id for d in docs_on} == {"r-border"} \ No newline at end of file diff --git a/bioscancast/tests/test_search_pipeline.py b/bioscancast/tests/test_search_pipeline.py index 907d6a4..d140fab 100644 --- a/bioscancast/tests/test_search_pipeline.py +++ b/bioscancast/tests/test_search_pipeline.py @@ -160,9 +160,14 @@ def test_total_cap_enforced(self): def test_scoring_formula(self): """Verify the search_stage_score formula for a known result.""" + from bioscancast.stages.search_stage.pipeline import _compute_relevance + + question = _make_question() results = self._run_pipeline() for r in results: - expected = 0.5 * r.domain_score + 0.3 * r.freshness_score + 0.2 * (1.0 / max(r.rank, 1)) + rel = _compute_relevance(r, question) + rank_score = 1.0 / max(r.rank, 1) + expected = 0.45 * rel + 0.30 * r.domain_score + 0.10 * r.freshness_score + 0.15 * rank_score expected = max(0.0, min(1.0, expected)) assert abs(r.search_stage_score - expected) < 1e-9, ( f"Score mismatch for {r.url}: {r.search_stage_score} != {expected}" diff --git a/bioscancast/tests/test_tier_resolution.py b/bioscancast/tests/test_tier_resolution.py index d8aa72f..d291303 100644 --- a/bioscancast/tests/test_tier_resolution.py +++ b/bioscancast/tests/test_tier_resolution.py @@ -54,6 +54,18 @@ def test_unknown_domain(self): assert score == 0.2 assert label == "unknown" + def test_national_news_is_trusted_media(self): + for domain in ("cnn.com", "nbcnews.com", "forbes.com", "latimes.com", "npr.org"): + tier, score, label = resolve_tier(domain) + assert tier == 3, domain + assert score == 0.6, domain + assert label == "trusted_media", domain + + def test_national_news_subdomain_match(self): + # edition.cnn.com / africa.businessinsider.com resolve via SLD. + assert resolve_tier("edition.cnn.com")[2] == "trusted_media" + assert resolve_tier("africa.businessinsider.com")[2] == "trusted_media" + def test_subdomain_match(self): """wwwnc.cdc.gov should match cdc.gov via second-level domain.""" tier, score, label = resolve_tier("wwwnc.cdc.gov")