Skip to content
112 changes: 95 additions & 17 deletions bioscancast/datasets/biosecurity_sources.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,112 @@
"""Known biosecurity dashboard URLs by pathogen.

v1 — flagged for iteration after first benchmark run. The dashboard list
and routing logic will need updating as new outbreaks emerge and data
portals change.

Each entry carries a pathogen-specific ``title`` and ``snippet`` so that
the heuristic filter and the LLM-rescue path have real signal to work
with. The earlier convention ("Dashboard: cdc.gov" with a generic
snippet) produced keyword_overlap_score = 0.000 across the board — see
issue #14 and the q7/q12 live-run findings.
"""

DASHBOARD_LOOKUP: dict[str, list[str]] = {
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class DashboardEntry:
    """A curated authoritative source for a pathogen.

    The title and snippet are intended to be readable as a search result
    in their own right: pathogen name, the kind of data the page hosts,
    and the publisher. They feed both the keyword-overlap heuristic and
    the LLM-rescue path.

    The dataclass is declared with ``frozen=True``, so fields cannot be
    reassigned after construction.

    Attributes:
        url: Address of the dashboard / source page.
        title: Search-result-style title naming the pathogen, the kind of
            data hosted, and the publisher.
        snippet: Short search-result-style description of what the page
            tracks.
    """

    # Address of the curated source page.
    url: str
    # Pathogen-specific title, written to read like a search-result title.
    title: str
    # Pathogen-specific summary, written to read like a search-result snippet.
    snippet: str


# Avian-influenza sources shared by the "h5n1" and "avian influenza" keys, so
# the two lists cannot drift apart. DashboardEntry is frozen, so sharing the
# instances is safe; each key below still gets its own list object.
_AVIAN_INFLUENZA_DASHBOARDS: tuple[DashboardEntry, ...] = (
    DashboardEntry(
        url="https://www.cdc.gov/bird-flu/situation-summary/",
        title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States",
        snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.",
    ),
    DashboardEntry(
        url="https://www.who.int/teams/global-influenza-programme/avian-influenza",
        title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance",
        snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.",
    ),
)

# Curated dashboards per pathogen. Keys are written in lowercase here
# (NOTE(review): confirm callers normalise the pathogen name the same way).
# Every list element must be a DashboardEntry: bare URL strings carry no
# title/snippet and contradict the declared value type, so none are kept.
DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = {
    "h5n1": list(_AVIAN_INFLUENZA_DASHBOARDS),
    "avian influenza": list(_AVIAN_INFLUENZA_DASHBOARDS),
    "mpox": [
        DashboardEntry(
            url="https://ourworldindata.org/mpox",
            title="Our World in Data mpox tracker: global confirmed cases and deaths",
            snippet="OWID dashboard tracking cumulative confirmed mpox cases and deaths globally, broken down by country and region, updated from national health agencies.",
        ),
        DashboardEntry(
            url="https://www.who.int/emergencies/situation-reports",
            title="WHO situation reports including the multi-country mpox outbreak",
            snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.",
        ),
        DashboardEntry(
            url="https://www.cdc.gov/monkeypox/situation-summary/index.html",
            title="CDC mpox current situation summary: confirmed cases in the United States",
            snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.",
        ),
    ],
    "ebola": [
        DashboardEntry(
            url="https://www.afro.who.int/health-topics/ebola-disease",
            title="WHO Africa Ebola virus disease outbreak surveillance and case counts",
            snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.",
        ),
        DashboardEntry(
            url="https://www.cdc.gov/ebola/about/index.html",
            title="CDC Ebola virus disease outbreak history and case counts",
            snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.",
        ),
    ],
    "covid-19": [
        DashboardEntry(
            url="https://ourworldindata.org/coronavirus",
            title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations",
            snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.",
        ),
        DashboardEntry(
            url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports",
            title="WHO COVID-19 situation reports and global case counts",
            snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.",
        ),
    ],
    "marburg": [
        DashboardEntry(
            url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease",
            title="WHO Marburg virus disease facts and outbreak case counts",
            snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.",
        ),
        DashboardEntry(
            url="https://www.cdc.gov/marburg/index.html",
            title="CDC Marburg virus disease outbreaks and surveillance",
            snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.",
        ),
    ],
}
30 changes: 30 additions & 0 deletions bioscancast/datasets/source_tiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,36 @@
"wikipedia.org",
"sciencedirect.com",
"pubmed.ncbi.nlm.nih.gov",
# National/international news with established newsrooms. Added after the
# #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md):
# live pools showed reputable outbreak reporting from these outlets
# resolving to "unknown" (domain_score 0.2), which sank them below the
# filter's credibility floor. Second-level-domain matching in
# resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com,
# africa.businessinsider.com, etc.).
# NOTE(review): "abcnews.go.com" is a third-level host under go.com — confirm
# that resolve_tier() also tries an exact-host match; otherwise that entry
# cannot be reached through second-level-domain matching alone.
"cnn.com",
"nbcnews.com",
"cbsnews.com",
"abcnews.go.com",
"abcnews.com",
"npr.org",
"pbs.org",
"usatoday.com",
"latimes.com",
"politico.com",
"politico.eu",
"axios.com",
"thehill.com",
"forbes.com",
"bloomberg.com",
"ft.com",
"wsj.com",
"economist.com",
"time.com",
"theatlantic.com",
"newyorker.com",
"arstechnica.com",
"businessinsider.com",
}

TIER_4_DOMAINS: set[str] = {
Expand Down
21 changes: 20 additions & 1 deletion bioscancast/filtering/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,14 @@
"domain": 0.20,
"official_bonus": 0.20,
},
"heuristic_keep_threshold": 0.72,
# Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter
# survival of 4.7% / 13.5% — the threshold was tighter than the
# heuristic's actual signal supports. Borderline candidates that
# cross the new threshold still go to the LLM rescue path; this
# change just stops dropping high-credibility-but-low-keyword-overlap
# results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7).
# See issue #13.
"heuristic_keep_threshold": 0.65,
"heuristic_borderline_threshold": 0.45,

"reranker_weights": {
Expand All @@ -49,6 +56,18 @@
"auto_reject_after_rerank": 0.30,
"max_llm_filter_candidates": 10,

# When no LLM client is configured, the ambiguous "llm_needed" band
# (reranked priority between auto_reject and auto_keep) is normally
# rejected outright (fail-closed). With this flag enabled — for dev /
# offline / no-API-key runs — a borderline candidate is instead KEPT if it
# is an official domain OR its keyword-overlap relevance clears
# ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue
# path without an API call, recovering the on-topic / authoritative tail
# without admitting the generic-news mass. Default OFF so production (which
# always has an LLM client) is unchanged. See issue #13.
"no_llm_soft_fallback": False,
"no_llm_fallback_relevance_threshold": 0.5,

"max_docs_per_domain": 2,
"max_docs_per_type": 5,

Expand Down
21 changes: 21 additions & 0 deletions bioscancast/filtering/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,27 @@ def heuristic_filter(
)
continue

# Dashboard-injected results are hand-curated in
# ``bioscancast/datasets/biosecurity_sources.py``; they bypass the
# keyword-overlap-driven heuristic which structurally undervalues
# their generic titles. See issue #14 and live-run data on q7/q12
# where 4/4 injected dashboards had keyword_overlap == 0.000.
if result.retrieval_reason == "dashboard_lookup":
relevance_score = compute_heuristic_relevance(result, question)
credibility_score = compute_heuristic_credibility(result)
keep_list.append(
make_decision(
result=result,
keep=True,
stage="heuristic",
relevance_score=relevance_score,
credibility_score=credibility_score,
priority_score=1.0,
reason_codes=["dashboard_lookup_bypass"],
)
)
continue

relevance_score = compute_heuristic_relevance(result, question)
credibility_score = compute_heuristic_credibility(result)
priority_score = compute_priority_score(result, relevance_score, credibility_score)
Expand Down
19 changes: 16 additions & 3 deletions bioscancast/filtering/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,24 @@ def run(
llm_decisions: list[FilterDecision] = []
if llm_needed:
if self.llm_client is None:
# Fail closed: reject ambiguous cases if no LLM client is configured.
# No LLM client. Default is fail-closed (reject the ambiguous
# band). When the soft-fallback flag is enabled, keep candidates
# that are official-domain or sufficiently relevant — see
# FILTER_CONFIG["no_llm_soft_fallback"] and issue #13.
soft = FILTER_CONFIG.get("no_llm_soft_fallback", False)
rel_threshold = FILTER_CONFIG.get(
"no_llm_fallback_relevance_threshold", 0.5
)
for d in llm_needed:
d.keep = False
d.stage = "llm_skipped"
d.reason_codes.append("no_llm_client_configured")
result = result_map.get(d.result_id)
is_official = bool(result and result.is_official_domain)
if soft and (is_official or d.relevance_score >= rel_threshold):
d.keep = True
d.reason_codes.append("no_llm_soft_fallback_kept")
else:
d.keep = False
d.reason_codes.append("no_llm_client_configured")
llm_decisions = llm_needed
else:
llm_decisions = llm_filter_candidates(
Expand Down
29 changes: 23 additions & 6 deletions bioscancast/filtering/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,38 @@ def cap_per_domain_and_type(
max_docs_per_domain: int,
max_docs_per_type: int,
) -> List[FilteredDocument]:
"""Limit how many docs from a single domain or file type survive.

Dashboard-bypassed docs (selection_reasons contains
``"dashboard_lookup_bypass"``) are always kept and do not consume a
slot against either cap. Curated dashboard injections are a separate
channel from organic search results; without this carve-out, a
dashboard sitting at synthetic priority 1.0 displaces a genuine
organic candidate on the same domain - which is exactly what
happened on q7 (WHO sitreps dashboard squeezed out the WHO research
event page that the baseline extracted records from).
"""
kept: list[FilteredDocument] = []
domain_counts = defaultdict(int)
type_counts = defaultdict(int)

for doc in docs:
doc_type = doc.file_type or "unknown"

if domain_counts[doc.domain] >= max_docs_per_domain:
continue
if type_counts[doc_type] >= max_docs_per_type:
continue
is_dashboard_bypass = "dashboard_lookup_bypass" in (
doc.selection_reasons or []
)

if not is_dashboard_bypass:
if domain_counts[doc.domain] >= max_docs_per_domain:
continue
if type_counts[doc_type] >= max_docs_per_type:
continue

kept.append(doc)
domain_counts[doc.domain] += 1
type_counts[doc_type] += 1
if not is_dashboard_bypass:
domain_counts[doc.domain] += 1
type_counts[doc_type] += 1

return kept

Expand Down
Loading