From 696d02778cc232caa268e0895c3ffde68704a87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:28:46 +0200 Subject: [PATCH 1/8] Bypass heuristic scoring for dashboard-injected results (#14) Dashboard URLs from the curated registry in bioscancast/datasets/biosecurity_sources.py have hand-picked titles ("Dashboard: cdc.gov") and generic snippets that produce keyword_overlap_score = 0.000 against any real forecast question. The heuristic priority score drags them under the 0.72 keep threshold even though they are by construction high-value sources. Live-run evidence: q7 and q12 each injected two dashboards. All four had keyword_overlap = 0.000. Two of those four were dropped pre-LLM, including ourworldindata.org for q7 - which is the resolution source named in the question's relevant_links column. Fix: in heuristic_filter, detect retrieval_reason == "dashboard_lookup" and auto-keep with reason_code "dashboard_lookup_bypass" and a synthetic priority_score of 1.0. The dashboards still go through the rest of the filtering pipeline (dedup, per-domain cap, extraction-hint assignment) unchanged - this is the keyword-overlap chokepoint only. Implements item 1 from the Tier 1 roadmap. Pairs with the dashboard title/snippet enrichment in the next commit. --- bioscancast/filtering/heuristics.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bioscancast/filtering/heuristics.py b/bioscancast/filtering/heuristics.py index 8aa67bc..fe41dc5 100644 --- a/bioscancast/filtering/heuristics.py +++ b/bioscancast/filtering/heuristics.py @@ -121,6 +121,27 @@ def heuristic_filter( ) continue + # Dashboard-injected results are hand-curated in + # ``bioscancast/datasets/biosecurity_sources.py``; they bypass the + # keyword-overlap-driven heuristic which structurally undervalues + # their generic titles. See issue #14 and live-run data on q7/q12 + # where 4/4 injected dashboards had keyword_overlap == 0.000. 
+ if result.retrieval_reason == "dashboard_lookup": + relevance_score = compute_heuristic_relevance(result, question) + credibility_score = compute_heuristic_credibility(result) + keep_list.append( + make_decision( + result=result, + keep=True, + stage="heuristic", + relevance_score=relevance_score, + credibility_score=credibility_score, + priority_score=1.0, + reason_codes=["dashboard_lookup_bypass"], + ) + ) + continue + relevance_score = compute_heuristic_relevance(result, question) credibility_score = compute_heuristic_credibility(result) priority_score = compute_priority_score(result, relevance_score, credibility_score) From 57f1c62da75e4e1da8547a9fdd752827e0b1efbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:30:54 +0200 Subject: [PATCH 2/8] Enrich dashboard titles and snippets with pathogen-specific text (#14) The previous dashboard injection used generic strings ("Dashboard: cdc.gov", "Known mpox monitoring dashboard") that produced keyword_overlap_score = 0.000 against every real forecast question - 4/4 injected dashboards in the q7/q12 live runs had this exact failure mode. The fix: keep DASHBOARD_LOOKUP as a dict keyed by pathogen, but change its values from lists of bare URL strings to lists of DashboardEntry dataclasses carrying url + title + snippet, with hand-written pathogen-specific text for each entry. The titles read as real search-result titles ("CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States") and the snippets describe what data the page hosts. Pairs with the previous commit's dashboard heuristic bypass: even with the bypass in place, better titles still help (a) the keyword-overlap score for downstream scoring, and (b) the LLM rescue path when it encounters other pathogen-specific dashboards we add later. The bypass keeps low-keyword-overlap dashboards alive; this commit makes them discoverable on their own merits. Implements item 5 from the Tier 1/2 roadmap. 
--- bioscancast/datasets/biosecurity_sources.py | 112 +++++++++++++++--- .../stages/search_stage/dashboard_lookup.py | 20 ++-- 2 files changed, 105 insertions(+), 27 deletions(-) diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py index ddfd11c..f4b9878 100644 --- a/bioscancast/datasets/biosecurity_sources.py +++ b/bioscancast/datasets/biosecurity_sources.py @@ -1,34 +1,112 @@ """Known biosecurity dashboard URLs by pathogen. -v1 — flagged for iteration after first benchmark run. -The dashboard list and routing logic will need updating as new outbreaks emerge -and data portals change. +v1 — flagged for iteration after first benchmark run. The dashboard list +and routing logic will need updating as new outbreaks emerge and data +portals change. + +Each entry carries a pathogen-specific ``title`` and ``snippet`` so that +the heuristic filter and the LLM-rescue path have real signal to work +with. The earlier convention ("Dashboard: cdc.gov" with a generic +snippet) produced keyword_overlap_score = 0.000 across the board — see +issue #14 and the q7/q12 live-run findings. """ -DASHBOARD_LOOKUP: dict[str, list[str]] = { +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class DashboardEntry: + """A curated authoritative source for a pathogen. + + The title and snippet are intended to be readable as a search result + in their own right: pathogen name, the kind of data the page hosts, + and the publisher. They feed both the keyword-overlap heuristic and + the LLM-rescue path. 
+ """ + + url: str + title: str + snippet: str + + +DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = { "h5n1": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "avian influenza": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "mpox": [ - "https://ourworldindata.org/mpox", - "https://www.who.int/emergencies/situation-reports", - "https://www.cdc.gov/mpox/data-research/index.html", + DashboardEntry( + url="https://ourworldindata.org/mpox", + title="Our World in Data mpox tracker: global confirmed cases and deaths", + snippet="OWID dashboard tracking cumulative confirmed mpox cases and 
deaths globally, broken down by country and region, updated from national health agencies.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/situation-reports", + title="WHO situation reports including the multi-country mpox outbreak", + snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.", + ), + DashboardEntry( + url="https://www.cdc.gov/mpox/data-research/index.html", + title="CDC mpox data and research dashboard for the United States", + snippet="CDC tracking of US mpox cases, demographic data, vaccination coverage, and outbreak response.", + ), ], "ebola": [ - "https://www.afro.who.int/health-topics/ebola-virus-disease", - "https://www.cdc.gov/ebola/index.html", + DashboardEntry( + url="https://www.afro.who.int/health-topics/ebola-virus-disease", + title="WHO Africa Ebola virus disease outbreak surveillance and case counts", + snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.", + ), + DashboardEntry( + url="https://www.cdc.gov/ebola/index.html", + title="CDC Ebola virus disease outbreak history and case counts", + snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.", + ), ], "covid-19": [ - "https://ourworldindata.org/coronavirus", - "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + DashboardEntry( + url="https://ourworldindata.org/coronavirus", + title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations", + snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + title="WHO COVID-19 situation 
reports and global case counts", + snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.", + ), ], "marburg": [ - "https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", - "https://www.cdc.gov/marburg/index.html", + DashboardEntry( + url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", + title="WHO Marburg virus disease facts and outbreak case counts", + snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.", + ), + DashboardEntry( + url="https://www.cdc.gov/marburg/index.html", + title="CDC Marburg virus disease outbreaks and surveillance", + snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.", + ), ], } diff --git a/bioscancast/stages/search_stage/dashboard_lookup.py b/bioscancast/stages/search_stage/dashboard_lookup.py index e3784c3..87fb9a5 100644 --- a/bioscancast/stages/search_stage/dashboard_lookup.py +++ b/bioscancast/stages/search_stage/dashboard_lookup.py @@ -48,21 +48,21 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: return [] pathogen_key = question.pathogen.strip().lower() - urls = DASHBOARD_LOOKUP.get(pathogen_key, []) - if not urls: + entries = DASHBOARD_LOOKUP.get(pathogen_key, []) + if not entries: return [] as_of = question.as_of_date results: list[SearchResult] = [] now = datetime.now(timezone.utc) - for url in urls: + for entry in entries: if as_of is not None: - snapshot = closest_snapshot_before(url, as_of) + snapshot = closest_snapshot_before(entry.url, as_of) if snapshot is None: logger.info( "Suppressing dashboard %s — no Wayback snapshot at-or-before %s", - url, as_of.isoformat(), + entry.url, as_of.isoformat(), ) continue snapshot_dt, snapshot_url = snapshot @@ -71,12 +71,12 @@ def lookup_dashboards(question: ForecastQuestion) -> 
List[SearchResult]: published_date_source = "wayback_snapshot" # Keep ``domain`` as the original publisher for tier scoring; # the URL itself points at archive.org for fetching. - domain = extract_domain(url) + domain = extract_domain(entry.url) else: - effective_url = url + effective_url = entry.url published_date = None published_date_source = None - domain = extract_domain(url) + domain = extract_domain(entry.url) tier_num, domain_score, source_tier = resolve_tier(domain) @@ -89,8 +89,8 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: url=effective_url, canonical_url=normalize_url(effective_url), domain=domain, - title=f"Dashboard: {domain}", - snippet=f"Known {pathogen_key} monitoring dashboard", + title=entry.title, + snippet=entry.snippet, rank=0, retrieved_at=now, published_date=published_date, From 29201465599531e3a78c463d2db9811f201204f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:32:05 +0200 Subject: [PATCH 3/8] Lower heuristic_keep_threshold from 0.72 to 0.65 (#13) Live runs on q7 and q12 showed filter survival of 4.7% and 13.5% respectively, even with LLM rescue enabled. The 0.72 threshold was set without benchmarking against real Tavily output and is too tight for the heuristic's actual signal. With the new threshold, priority_scores in the 0.65-0.72 band are auto-kept by heuristics instead of routed to the LLM rescue path. The borderline threshold (0.45) is unchanged, so the LLM filter still gates 0.45-0.65 candidates - the change just moves the auto-keep line to better match what the heuristic can actually distinguish. Implements item 2 from the Tier 1 roadmap. Pairs with the dashboard bypass + enrichment commits to attack the filter chokepoint from multiple angles. 
--- bioscancast/filtering/config.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py index f0477aa..4bf99c2 100644 --- a/bioscancast/filtering/config.py +++ b/bioscancast/filtering/config.py @@ -38,7 +38,14 @@ "domain": 0.20, "official_bonus": 0.20, }, - "heuristic_keep_threshold": 0.72, + # Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter + # survival of 4.7% / 13.5% — the threshold was tighter than the + # heuristic's actual signal supports. Borderline candidates that + # cross the new threshold still go to the LLM rescue path; this + # change just stops dropping high-credibility-but-low-keyword-overlap + # results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7). + # See issue #13. + "heuristic_keep_threshold": 0.65, "heuristic_borderline_threshold": 0.45, "reranker_weights": { From 0d2a2064166e9711d564c6f133a111306696fefa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:57:01 +0200 Subject: [PATCH 4/8] Exempt dashboard-bypassed docs from cap_per_domain_and_type q7's second live run on this branch surfaced an interaction between the new dashboard heuristic bypass and the per-domain cap. With max_docs_per_domain=2 and the dashboard bypass injecting one who.int slot at synthetic priority 1.0, the cap was effectively reducing who.int to ONE organic slot - and the slot was going to a priority-0.7097 strategic-plan announcement page, squeezing out the priority-0.6966 WHO mpox research event page that the baseline run had extracted records from. 
Offline filter replay on the saved q7 search.json confirms the mechanism: Heuristic-keep (4 who.int / ourworldindata.org docs): 1.0000 WHO sitreps dashboard (bypass) 1.0000 OWID mpox dashboard (bypass) 0.7097 WHO global strategic preparedness plan (organic) 0.6966 WHO mpox research event (organic) <- baseline's data source After old cap_per_domain (max=2 per domain): Dashboards displace one organic each; research event capped out. The fix: dashboard-bypass docs (selection_reasons contains "dashboard_lookup_bypass") are always kept and do not consume a slot against the per-domain or per-type caps. They are curated additions, not competing organic results. After the change all four candidates survive, and the WHO research event page reaches insight as it did in the baseline. 447 tests still passing. --- bioscancast/filtering/postprocess.py | 29 ++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/bioscancast/filtering/postprocess.py b/bioscancast/filtering/postprocess.py index c0caba5..fd8d07c 100644 --- a/bioscancast/filtering/postprocess.py +++ b/bioscancast/filtering/postprocess.py @@ -49,6 +49,17 @@ def cap_per_domain_and_type( max_docs_per_domain: int, max_docs_per_type: int, ) -> List[FilteredDocument]: + """Limit how many docs from a single domain or file type survive. + + Dashboard-bypassed docs (selection_reasons contains + ``"dashboard_lookup_bypass"``) are always kept and do not consume a + slot against either cap. Curated dashboard injections are a separate + channel from organic search results; without this carve-out, a + dashboard sitting at synthetic priority 1.0 displaces a genuine + organic candidate on the same domain - which is exactly what + happened on q7 (WHO sitreps dashboard squeezed out the WHO research + event page that the baseline extracted records from). 
+ """ kept: list[FilteredDocument] = [] domain_counts = defaultdict(int) type_counts = defaultdict(int) @@ -56,14 +67,20 @@ def cap_per_domain_and_type( for doc in docs: doc_type = doc.file_type or "unknown" - if domain_counts[doc.domain] >= max_docs_per_domain: - continue - if type_counts[doc_type] >= max_docs_per_type: - continue + is_dashboard_bypass = "dashboard_lookup_bypass" in ( + doc.selection_reasons or [] + ) + + if not is_dashboard_bypass: + if domain_counts[doc.domain] >= max_docs_per_domain: + continue + if type_counts[doc_type] >= max_docs_per_type: + continue kept.append(doc) - domain_counts[doc.domain] += 1 - type_counts[doc_type] += 1 + if not is_dashboard_bypass: + domain_counts[doc.domain] += 1 + type_counts[doc_type] += 1 return kept From be45701980cc9c3184fb040fa37186d7dbd304ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:02:59 +0200 Subject: [PATCH 5/8] Add a relevance term to the search-stage score search_stage_score was 0.5*domain + 0.3*freshness + 0.2*rank with no topical-relevance signal, so high-authority but off-topic results ranked at the top (e.g. sports/legal/unrelated-pathogen news). It is now 0.45*relevance + 0.30*domain + 0.10*freshness + 0.15*rank, reusing the filter's keyword_overlap_score/build_query_terms. Freshness is kept low because it is near-uniform in live mode. Addresses #4. 
Co-Authored-By: Claude Opus 4.7 --- bioscancast/stages/search_stage/pipeline.py | 42 +++++++++++++++++++-- bioscancast/tests/test_search_pipeline.py | 7 +++- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/bioscancast/stages/search_stage/pipeline.py b/bioscancast/stages/search_stage/pipeline.py index edc573f..1477634 100644 --- a/bioscancast/stages/search_stage/pipeline.py +++ b/bioscancast/stages/search_stage/pipeline.py @@ -13,7 +13,9 @@ from typing import List, Optional from bioscancast.filtering.config import FILTER_CONFIG +from bioscancast.filtering.heuristics import build_query_terms from bioscancast.filtering.models import ForecastQuestion, SearchResult +from bioscancast.filtering.utils import keyword_overlap_score from bioscancast.llm.base import LLMClient from bioscancast.stages.search_stage.backends.base import RawSearchResult, SearchBackend from bioscancast.stages.search_stage.cache import SearchCache @@ -83,10 +85,39 @@ def _compute_freshness( return max(0.0, min(1.0, 1.0 - (days_old / 365.0))) -def _compute_search_stage_score(domain_score: float, freshness_score: float, rank: int) -> float: - """search_stage_score = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * (1/rank)""" +# search_stage_score weights (sum to 1.0). Relevance (keyword overlap of +# title/snippet/domain against the question terms) is the dominant term: +# domain/freshness/rank alone rank off-topic high-authority content too highly, +# because freshness is ~uniform in live mode and domain score is too coarse to +# separate on-topic from off-topic within a tier. Freshness is kept low for that +# reason. See data/investigations/findings-issues-3-4-13.md (#4). +_SCORE_W_RELEVANCE = 0.45 +_SCORE_W_DOMAIN = 0.30 +_SCORE_W_FRESHNESS = 0.10 +_SCORE_W_RANK = 0.15 + + +def _compute_relevance(result: SearchResult, question: ForecastQuestion) -> float: + """Keyword overlap of the result against the question terms. 
+ + Mirrors ``bioscancast.filtering.heuristics.compute_heuristic_relevance`` so + the search stage and the filter stage use the same relevance signal. + """ + text = f"{result.title} {result.snippet} {result.domain}" + return keyword_overlap_score(text, build_query_terms(question)) + + +def _compute_search_stage_score( + relevance: float, domain_score: float, freshness_score: float, rank: int +) -> float: + """search_stage_score = 0.45*relevance + 0.30*domain + 0.10*freshness + 0.15*(1/rank)""" rank_score = 1.0 / max(rank, 1) - raw = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * rank_score + raw = ( + _SCORE_W_RELEVANCE * relevance + + _SCORE_W_DOMAIN * domain_score + + _SCORE_W_FRESHNESS * freshness_score + + _SCORE_W_RANK * rank_score + ) return max(0.0, min(1.0, raw)) @@ -260,7 +291,10 @@ def run(self, question: ForecastQuestion) -> List[SearchResult]: r.published_date, reference_date=as_of ) r.search_stage_score = _compute_search_stage_score( - r.domain_score, r.freshness_score, r.rank + _compute_relevance(r, question), + r.domain_score, + r.freshness_score, + r.rank, ) # 8. 
Sort and cap diff --git a/bioscancast/tests/test_search_pipeline.py b/bioscancast/tests/test_search_pipeline.py index 907d6a4..d140fab 100644 --- a/bioscancast/tests/test_search_pipeline.py +++ b/bioscancast/tests/test_search_pipeline.py @@ -160,9 +160,14 @@ def test_total_cap_enforced(self): def test_scoring_formula(self): """Verify the search_stage_score formula for a known result.""" + from bioscancast.stages.search_stage.pipeline import _compute_relevance + + question = _make_question() results = self._run_pipeline() for r in results: - expected = 0.5 * r.domain_score + 0.3 * r.freshness_score + 0.2 * (1.0 / max(r.rank, 1)) + rel = _compute_relevance(r, question) + rank_score = 1.0 / max(r.rank, 1) + expected = 0.45 * rel + 0.30 * r.domain_score + 0.10 * r.freshness_score + 0.15 * rank_score expected = max(0.0, min(1.0, expected)) assert abs(r.search_stage_score - expected) < 1e-9, ( f"Score mismatch for {r.url}: {r.search_stage_score} != {expected}" From 65fcb9cc148c978cc356d4fb5231cab5835f8be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:12 +0200 Subject: [PATCH 6/8] Fix stale dashboard URLs and make pathogen routing tolerant The CDC mpox dashboard URL returned 404; replaced with the (extractable) monkeypox/situation-summary page. Updated two stale redirects (afro.who.int ebola-disease, cdc.gov/ebola/about). DASHBOARD_LOOKUP routing was an exact lowercase key match, so 'marburg virus disease' failed to route to the 'marburg' key; added _resolve_pathogen_key with alias + substring matching (marburg virus disease->marburg, monkeypox->mpox, bird flu->h5n1). Addresses #3. 
Co-Authored-By: Claude Opus 4.7 --- bioscancast/datasets/biosecurity_sources.py | 10 ++-- .../stages/search_stage/dashboard_lookup.py | 46 +++++++++++++++++-- bioscancast/tests/test_dashboard_lookup.py | 17 +++++++ 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py index f4b9878..2a47288 100644 --- a/bioscancast/datasets/biosecurity_sources.py +++ b/bioscancast/datasets/biosecurity_sources.py @@ -68,19 +68,19 @@ class DashboardEntry: snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.", ), DashboardEntry( - url="https://www.cdc.gov/mpox/data-research/index.html", - title="CDC mpox data and research dashboard for the United States", - snippet="CDC tracking of US mpox cases, demographic data, vaccination coverage, and outbreak response.", + url="https://www.cdc.gov/monkeypox/situation-summary/index.html", + title="CDC mpox current situation summary: confirmed cases in the United States", + snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.", ), ], "ebola": [ DashboardEntry( - url="https://www.afro.who.int/health-topics/ebola-virus-disease", + url="https://www.afro.who.int/health-topics/ebola-disease", title="WHO Africa Ebola virus disease outbreak surveillance and case counts", snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.", ), DashboardEntry( - url="https://www.cdc.gov/ebola/index.html", + url="https://www.cdc.gov/ebola/about/index.html", title="CDC Ebola virus disease outbreak history and case counts", snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.", ), diff --git 
a/bioscancast/stages/search_stage/dashboard_lookup.py b/bioscancast/stages/search_stage/dashboard_lookup.py index 87fb9a5..2d3048c 100644 --- a/bioscancast/stages/search_stage/dashboard_lookup.py +++ b/bioscancast/stages/search_stage/dashboard_lookup.py @@ -31,6 +31,46 @@ logger = logging.getLogger(__name__) +# Common name variants that should route to a canonical DASHBOARD_LOOKUP key. +# The canonical-key substring fallback in ``_resolve_pathogen_key`` already +# handles suffixes like "marburg virus disease" -> "marburg"; this map covers +# synonyms where the canonical key is NOT a substring of the alias. +_PATHOGEN_ALIASES: dict[str, str] = { + "monkeypox": "mpox", + "sars-cov-2": "covid-19", + "sars-cov2": "covid-19", + "covid": "covid-19", + "covid19": "covid-19", + "coronavirus": "covid-19", + "bird flu": "h5n1", + "avian flu": "h5n1", +} + + +def _resolve_pathogen_key(pathogen: str) -> str | None: + """Map a free-text pathogen string to a DASHBOARD_LOOKUP key, tolerantly. + + Resolution order: exact key, exact alias, alias-substring, then + canonical-key substring (longest match wins, so "ebola virus disease" + resolves to "ebola" and "marburg virus disease" to "marburg"). Returns + None if nothing matches. + """ + key = pathogen.strip().lower() + if not key: + return None + if key in DASHBOARD_LOOKUP: + return key + if key in _PATHOGEN_ALIASES and _PATHOGEN_ALIASES[key] in DASHBOARD_LOOKUP: + return _PATHOGEN_ALIASES[key] + for alias, canon in _PATHOGEN_ALIASES.items(): + if alias in key and canon in DASHBOARD_LOOKUP: + return canon + matches = [k for k in DASHBOARD_LOOKUP if k in key] + if matches: + return max(matches, key=len) + return None + + def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: """Generate synthetic SearchResult entries for known pathogen dashboards. 
@@ -47,10 +87,10 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: if not question.pathogen: return [] - pathogen_key = question.pathogen.strip().lower() - entries = DASHBOARD_LOOKUP.get(pathogen_key, []) - if not entries: + pathogen_key = _resolve_pathogen_key(question.pathogen) + if not pathogen_key: return [] + entries = DASHBOARD_LOOKUP[pathogen_key] as_of = question.as_of_date results: list[SearchResult] = [] diff --git a/bioscancast/tests/test_dashboard_lookup.py b/bioscancast/tests/test_dashboard_lookup.py index c244e49..73163b0 100644 --- a/bioscancast/tests/test_dashboard_lookup.py +++ b/bioscancast/tests/test_dashboard_lookup.py @@ -46,6 +46,23 @@ def test_case_insensitive(self): results = lookup_dashboards(q) assert len(results) > 0 + def test_multiword_pathogen_routes_via_substring(self): + # CSV-natural "Marburg Virus Disease" -> pathogen "marburg virus disease" + # must still route to the "marburg" dashboard key. + canonical = lookup_dashboards(_make_question(pathogen="marburg")) + multiword = lookup_dashboards(_make_question(pathogen="marburg virus disease")) + assert len(multiword) > 0 + assert [r.url for r in multiword] == [r.url for r in canonical] + + def test_alias_routes_to_canonical(self): + # "monkeypox" -> "mpox"; "bird flu" -> "h5n1". 
+ assert len(lookup_dashboards(_make_question(pathogen="monkeypox"))) > 0 + assert ( + [r.url for r in lookup_dashboards(_make_question(pathogen="monkeypox"))] + == [r.url for r in lookup_dashboards(_make_question(pathogen="mpox"))] + ) + assert len(lookup_dashboards(_make_question(pathogen="bird flu"))) > 0 + def test_results_have_required_fields(self): q = _make_question(pathogen="ebola") results = lookup_dashboards(q) From 7a1600035dd7885a877d8171f5a239121786e816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:23 +0200 Subject: [PATCH 7/8] Add national/international news outlets to Tier 3 Reputable outbreak reporting from outlets like CNN, NBC, CBS, ABC, NPR, USA Today, LA Times, Politico, Axios, Forbes, Bloomberg, FT, WSJ, Economist, Time, The Atlantic, Ars Technica and Business Insider was resolving to the 'unknown' tier (domain_score 0.2), sinking it below the filter's credibility floor. Promote them to Tier 3 (trusted_media, 0.6); second-level-domain matching covers subdomains. Relates to #13. Co-Authored-By: Claude Opus 4.7 --- bioscancast/datasets/source_tiers.py | 30 +++++++++++++++++++++++ bioscancast/tests/test_tier_resolution.py | 12 +++++++++ 2 files changed, 42 insertions(+) diff --git a/bioscancast/datasets/source_tiers.py b/bioscancast/datasets/source_tiers.py index 93a0013..8abadcc 100644 --- a/bioscancast/datasets/source_tiers.py +++ b/bioscancast/datasets/source_tiers.py @@ -59,6 +59,36 @@ "wikipedia.org", "sciencedirect.com", "pubmed.ncbi.nlm.nih.gov", + # National/international news with established newsrooms. Added after the + # #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md): + # live pools showed reputable outbreak reporting from these outlets + # resolving to "unknown" (domain_score 0.2), which sank them below the + # filter's credibility floor. 
Second-level-domain matching in + # resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com, + # africa.businessinsider.com, etc.). + "cnn.com", + "nbcnews.com", + "cbsnews.com", + "abcnews.go.com", + "abcnews.com", + "npr.org", + "pbs.org", + "usatoday.com", + "latimes.com", + "politico.com", + "politico.eu", + "axios.com", + "thehill.com", + "forbes.com", + "bloomberg.com", + "ft.com", + "wsj.com", + "economist.com", + "time.com", + "theatlantic.com", + "newyorker.com", + "arstechnica.com", + "businessinsider.com", } TIER_4_DOMAINS: set[str] = { diff --git a/bioscancast/tests/test_tier_resolution.py b/bioscancast/tests/test_tier_resolution.py index d8aa72f..d291303 100644 --- a/bioscancast/tests/test_tier_resolution.py +++ b/bioscancast/tests/test_tier_resolution.py @@ -54,6 +54,18 @@ def test_unknown_domain(self): assert score == 0.2 assert label == "unknown" + def test_national_news_is_trusted_media(self): + for domain in ("cnn.com", "nbcnews.com", "forbes.com", "latimes.com", "npr.org"): + tier, score, label = resolve_tier(domain) + assert tier == 3, domain + assert score == 0.6, domain + assert label == "trusted_media", domain + + def test_national_news_subdomain_match(self): + # edition.cnn.com / africa.businessinsider.com resolve via SLD. + assert resolve_tier("edition.cnn.com")[2] == "trusted_media" + assert resolve_tier("africa.businessinsider.com")[2] == "trusted_media" + def test_subdomain_match(self): """wwwnc.cdc.gov should match cdc.gov via second-level domain.""" tier, score, label = resolve_tier("wwwnc.cdc.gov") From 770f339e05c1f0ce78b54f785a352096e35b6587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:34 +0200 Subject: [PATCH 8/8] Add opt-in soft fallback for the no-LLM filter path When llm_client is None the ambiguous rerank band was always rejected (fail-closed), which is overly aggressive for dev/offline/no-API-key runs. 
Add a default-off FILTER_CONFIG flag 'no_llm_soft_fallback' (+ no_llm_fallback_relevance_threshold) that instead keeps a borderline candidate iff it is an official domain OR its keyword-overlap relevance clears the threshold, approximating the LLM-rescue path. Production (always has an LLM client) is unchanged. Addresses #13. Co-Authored-By: Claude Opus 4.7 --- bioscancast/filtering/config.py | 12 +++++++ bioscancast/filtering/pipeline.py | 19 ++++++++-- bioscancast/tests/test_pipeline.py | 58 +++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py index 4bf99c2..187dee4 100644 --- a/bioscancast/filtering/config.py +++ b/bioscancast/filtering/config.py @@ -56,6 +56,18 @@ "auto_reject_after_rerank": 0.30, "max_llm_filter_candidates": 10, + # When no LLM client is configured, the ambiguous "llm_needed" band + # (reranked priority between auto_reject and auto_keep) is normally + # rejected outright (fail-closed). With this flag enabled — for dev / + # offline / no-API-key runs — a borderline candidate is instead KEPT if it + # is an official domain OR its keyword-overlap relevance clears + # ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue + # path without an API call, recovering the on-topic / authoritative tail + # without admitting the generic-news mass. Default OFF so production (which + # always has an LLM client) is unchanged. See issue #13. 
+ "no_llm_soft_fallback": False, + "no_llm_fallback_relevance_threshold": 0.5, + "max_docs_per_domain": 2, "max_docs_per_type": 5, diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py index e8c2e5b..80dc0d7 100644 --- a/bioscancast/filtering/pipeline.py +++ b/bioscancast/filtering/pipeline.py @@ -37,11 +37,24 @@ def run( llm_decisions: list[FilterDecision] = [] if llm_needed: if self.llm_client is None: - # Fail closed: reject ambiguous cases if no LLM client is configured. + # No LLM client. Default is fail-closed (reject the ambiguous + # band). When the soft-fallback flag is enabled, keep candidates + # that are official-domain or sufficiently relevant — see + # FILTER_CONFIG["no_llm_soft_fallback"] and issue #13. + soft = FILTER_CONFIG.get("no_llm_soft_fallback", False) + rel_threshold = FILTER_CONFIG.get( + "no_llm_fallback_relevance_threshold", 0.5 + ) for d in llm_needed: - d.keep = False d.stage = "llm_skipped" - d.reason_codes.append("no_llm_client_configured") + result = result_map.get(d.result_id) + is_official = bool(result and result.is_official_domain) + if soft and (is_official or d.relevance_score >= rel_threshold): + d.keep = True + d.reason_codes.append("no_llm_soft_fallback_kept") + else: + d.keep = False + d.reason_codes.append("no_llm_client_configured") llm_decisions = llm_needed else: llm_decisions = llm_filter_candidates( diff --git a/bioscancast/tests/test_pipeline.py b/bioscancast/tests/test_pipeline.py index f65bceb..f914225 100644 --- a/bioscancast/tests/test_pipeline.py +++ b/bioscancast/tests/test_pipeline.py @@ -1,5 +1,6 @@ from datetime import datetime +from bioscancast.filtering.config import FILTER_CONFIG from bioscancast.filtering.models import ForecastQuestion, SearchResult from bioscancast.filtering.pipeline import FilteringPipeline @@ -36,4 +37,59 @@ def test_pipeline_keeps_official_result(): docs = pipeline.run(question, [result]) assert len(docs) == 1 - assert docs[0].domain == "who.int" \ No 
newline at end of file + assert docs[0].domain == "who.int" + + +def _borderline_question(): + return ForecastQuestion( + id="q1", + text="How many confirmed Ebola cases in the DRC outbreak?", + created_at=datetime(2026, 5, 1), + pathogen="ebola", + region="DRC", + ) + + +def _borderline_result(): + # trusted_media (domain_score 0.6, non-official) with partial term overlap → + # lands in the heuristic borderline band, then the no-LLM "llm_needed" band. + return SearchResult( + id="r-border", + question_id="q1", + query_id="sq1", + engine="google", + url="https://www.cnn.com/ebola-drc", + canonical_url="https://www.cnn.com/ebola-drc", + domain="cnn.com", + title="Ebola cases climb in the DRC outbreak", + snippet="Confirmed Ebola cases reported in the Democratic Republic of the Congo outbreak.", + rank=2, + retrieved_at=datetime(2026, 5, 1), + source_tier="trusted_media", + is_official_domain=False, + domain_score=0.6, + freshness_score=1.0, + search_stage_score=0.6, + ) + + +def test_no_llm_soft_fallback_flag_changes_borderline_outcome(): + question = _borderline_question() + result = _borderline_result() + + saved = dict(FILTER_CONFIG) + try: + # Flag OFF (default): fail closed → borderline candidate dropped. + FILTER_CONFIG["no_llm_soft_fallback"] = False + docs_off = FilteringPipeline(llm_client=None).run(question, [result]) + + # Flag ON: relevant borderline candidate kept without an LLM call. + FILTER_CONFIG["no_llm_soft_fallback"] = True + FILTER_CONFIG["no_llm_fallback_relevance_threshold"] = 0.0 + docs_on = FilteringPipeline(llm_client=None).run(question, [result]) + finally: + FILTER_CONFIG.clear() + FILTER_CONFIG.update(saved) + + assert {d.result_id for d in docs_off} == set() + assert {d.result_id for d in docs_on} == {"r-border"} \ No newline at end of file