Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
128463b
Fix Excel-serial parsing in load_questions
smodee May 27, 2026
94b315a
Add build_forecast_question factory and load_question_by_id
smodee May 27, 2026
7b84936
Add orchestration/test_questions.csv with q7 + q12
smodee May 27, 2026
015cf1f
Add OpenAI price table and cost estimator
smodee May 27, 2026
ce8c718
Add bioscancast.orchestration package and persistence helpers
smodee May 27, 2026
c71dafd
Implement end-to-end pipeline orchestrator in bioscancast/main.py
smodee May 27, 2026
7c8c9d8
Add scripts/run_pipeline.py wrapper and gitignore data/runs/
smodee May 27, 2026
86dc646
Handle sets and dated OpenAI model aliases (surfaced by live runs)
smodee May 27, 2026
2a03ea8
Bypass heuristic scoring for dashboard-injected results (#14)
smodee May 27, 2026
13db598
Enrich dashboard titles and snippets with pathogen-specific text (#14)
smodee May 27, 2026
1a4816a
Lower heuristic_keep_threshold from 0.72 to 0.65 (#13)
smodee May 27, 2026
c81f007
Add suspected_deaths to controlled metric_name vocabulary
smodee May 27, 2026
26ae088
Require the quote field to contain the figure (extraction prompt)
smodee May 27, 2026
c584ef6
Adaptive retrieval_top_k when filter survivors are few
smodee May 27, 2026
eb0a5f6
Exempt dashboard-bypassed docs from cap_per_domain_and_type
smodee May 27, 2026
75f2925
Add per-stage cost line-items to orchestrator epilogue
smodee May 28, 2026
76d5adf
Add case-insensitive layer 4 to the hallucination guard
smodee May 28, 2026
cf4bed2
Add a relevance term to the search-stage score
smodee May 28, 2026
61e7787
Fix stale dashboard URLs and make pathogen routing tolerant
smodee May 28, 2026
8136172
Add national/international news outlets to Tier 3
smodee May 28, 2026
953a95a
Add opt-in soft fallback for the no-LLM filter path
smodee May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ build/

# Data / cache
data/cache/
data/runs/
*.sqlite

# Docling eval — keep FINDINGS.md and sources/, ignore generated run artifacts
Expand Down
112 changes: 95 additions & 17 deletions bioscancast/datasets/biosecurity_sources.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,112 @@
"""Known biosecurity dashboard URLs by pathogen.

v1 — flagged for iteration after first benchmark run.
The dashboard list and routing logic will need updating as new outbreaks emerge
and data portals change.
v1 — flagged for iteration after first benchmark run. The dashboard list
and routing logic will need updating as new outbreaks emerge and data
portals change.

Each entry carries a pathogen-specific ``title`` and ``snippet`` so that
the heuristic filter and the LLM-rescue path have real signal to work
with. The earlier convention ("Dashboard: cdc.gov" with a generic
snippet) produced keyword_overlap_score = 0.000 across the board — see
issue #14 and the q7/q12 live-run findings.
"""

DASHBOARD_LOOKUP: dict[str, list[str]] = {
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class DashboardEntry:
"""A curated authoritative source for a pathogen.

The title and snippet are intended to be readable as a search result
in their own right: pathogen name, the kind of data the page hosts,
and the publisher. They feed both the keyword-overlap heuristic and
the LLM-rescue path.
"""

url: str
title: str
snippet: str


DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = {
"h5n1": [
"https://www.cdc.gov/bird-flu/situation-summary/",
"https://www.who.int/teams/global-influenza-programme/avian-influenza",
DashboardEntry(
url="https://www.cdc.gov/bird-flu/situation-summary/",
title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States",
snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.",
),
DashboardEntry(
url="https://www.who.int/teams/global-influenza-programme/avian-influenza",
title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance",
snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.",
),
],
"avian influenza": [
"https://www.cdc.gov/bird-flu/situation-summary/",
"https://www.who.int/teams/global-influenza-programme/avian-influenza",
DashboardEntry(
url="https://www.cdc.gov/bird-flu/situation-summary/",
title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States",
snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.",
),
DashboardEntry(
url="https://www.who.int/teams/global-influenza-programme/avian-influenza",
title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance",
snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.",
),
],
"mpox": [
"https://ourworldindata.org/mpox",
"https://www.who.int/emergencies/situation-reports",
"https://www.cdc.gov/mpox/data-research/index.html",
DashboardEntry(
url="https://ourworldindata.org/mpox",
title="Our World in Data mpox tracker: global confirmed cases and deaths",
snippet="OWID dashboard tracking cumulative confirmed mpox cases and deaths globally, broken down by country and region, updated from national health agencies.",
),
DashboardEntry(
url="https://www.who.int/emergencies/situation-reports",
title="WHO situation reports including the multi-country mpox outbreak",
snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.",
),
DashboardEntry(
url="https://www.cdc.gov/monkeypox/situation-summary/index.html",
title="CDC mpox current situation summary: confirmed cases in the United States",
snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.",
),
],
"ebola": [
"https://www.afro.who.int/health-topics/ebola-virus-disease",
"https://www.cdc.gov/ebola/index.html",
DashboardEntry(
url="https://www.afro.who.int/health-topics/ebola-disease",
title="WHO Africa Ebola virus disease outbreak surveillance and case counts",
snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.",
),
DashboardEntry(
url="https://www.cdc.gov/ebola/about/index.html",
title="CDC Ebola virus disease outbreak history and case counts",
snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.",
),
],
"covid-19": [
"https://ourworldindata.org/coronavirus",
"https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports",
DashboardEntry(
url="https://ourworldindata.org/coronavirus",
title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations",
snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.",
),
DashboardEntry(
url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports",
title="WHO COVID-19 situation reports and global case counts",
snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.",
),
],
"marburg": [
"https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease",
"https://www.cdc.gov/marburg/index.html",
DashboardEntry(
url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease",
title="WHO Marburg virus disease facts and outbreak case counts",
snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.",
),
DashboardEntry(
url="https://www.cdc.gov/marburg/index.html",
title="CDC Marburg virus disease outbreaks and surveillance",
snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.",
),
],
}
30 changes: 30 additions & 0 deletions bioscancast/datasets/source_tiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,36 @@
"wikipedia.org",
"sciencedirect.com",
"pubmed.ncbi.nlm.nih.gov",
# National/international news with established newsrooms. Added after the
# #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md):
# live pools showed reputable outbreak reporting from these outlets
# resolving to "unknown" (domain_score 0.2), which sank them below the
# filter's credibility floor. Second-level-domain matching in
# resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com,
# africa.businessinsider.com, etc.).
"cnn.com",
"nbcnews.com",
"cbsnews.com",
"abcnews.go.com",
"abcnews.com",
"npr.org",
"pbs.org",
"usatoday.com",
"latimes.com",
"politico.com",
"politico.eu",
"axios.com",
"thehill.com",
"forbes.com",
"bloomberg.com",
"ft.com",
"wsj.com",
"economist.com",
"time.com",
"theatlantic.com",
"newyorker.com",
"arstechnica.com",
"businessinsider.com",
}

TIER_4_DOMAINS: set[str] = {
Expand Down
21 changes: 20 additions & 1 deletion bioscancast/filtering/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,14 @@
"domain": 0.20,
"official_bonus": 0.20,
},
"heuristic_keep_threshold": 0.72,
# Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter
# survival of 4.7% / 13.5% — the threshold was tighter than the
# heuristic's actual signal supports. Borderline candidates that
# cross the new threshold still go to the LLM rescue path; this
# change just stops dropping high-credibility-but-low-keyword-overlap
# results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7).
# See issue #13.
"heuristic_keep_threshold": 0.65,
"heuristic_borderline_threshold": 0.45,

"reranker_weights": {
Expand All @@ -49,6 +56,18 @@
"auto_reject_after_rerank": 0.30,
"max_llm_filter_candidates": 10,

# When no LLM client is configured, the ambiguous "llm_needed" band
# (reranked priority between auto_reject and auto_keep) is normally
# rejected outright (fail-closed). With this flag enabled — for dev /
# offline / no-API-key runs — a borderline candidate is instead KEPT if it
# comes from an official domain OR its keyword-overlap relevance clears
# ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue
# path without an API call, recovering the on-topic / authoritative tail
# without admitting the generic-news mass. Default OFF so production (which
# always has an LLM client) is unchanged. See issue #13.
"no_llm_soft_fallback": False,
"no_llm_fallback_relevance_threshold": 0.5,

"max_docs_per_domain": 2,
"max_docs_per_type": 5,

Expand Down
21 changes: 21 additions & 0 deletions bioscancast/filtering/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,27 @@ def heuristic_filter(
)
continue

# Dashboard-injected results are hand-curated in
# ``bioscancast/datasets/biosecurity_sources.py``; they bypass the
# keyword-overlap-driven heuristic which structurally undervalues
# their generic titles. See issue #14 and live-run data on q7/q12
# where 4/4 injected dashboards had keyword_overlap == 0.000.
if result.retrieval_reason == "dashboard_lookup":
relevance_score = compute_heuristic_relevance(result, question)
credibility_score = compute_heuristic_credibility(result)
keep_list.append(
make_decision(
result=result,
keep=True,
stage="heuristic",
relevance_score=relevance_score,
credibility_score=credibility_score,
priority_score=1.0,
reason_codes=["dashboard_lookup_bypass"],
)
)
continue

relevance_score = compute_heuristic_relevance(result, question)
credibility_score = compute_heuristic_credibility(result)
priority_score = compute_priority_score(result, relevance_score, credibility_score)
Expand Down
19 changes: 16 additions & 3 deletions bioscancast/filtering/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,24 @@ def run(
llm_decisions: list[FilterDecision] = []
if llm_needed:
if self.llm_client is None:
# Fail closed: reject ambiguous cases if no LLM client is configured.
# No LLM client. Default is fail-closed (reject the ambiguous
# band). When the soft-fallback flag is enabled, keep candidates
# that are official-domain or sufficiently relevant — see
# FILTER_CONFIG["no_llm_soft_fallback"] and issue #13.
soft = FILTER_CONFIG.get("no_llm_soft_fallback", False)
rel_threshold = FILTER_CONFIG.get(
"no_llm_fallback_relevance_threshold", 0.5
)
for d in llm_needed:
d.keep = False
d.stage = "llm_skipped"
d.reason_codes.append("no_llm_client_configured")
result = result_map.get(d.result_id)
is_official = bool(result and result.is_official_domain)
if soft and (is_official or d.relevance_score >= rel_threshold):
d.keep = True
d.reason_codes.append("no_llm_soft_fallback_kept")
else:
d.keep = False
d.reason_codes.append("no_llm_client_configured")
llm_decisions = llm_needed
else:
llm_decisions = llm_filter_candidates(
Expand Down
29 changes: 23 additions & 6 deletions bioscancast/filtering/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,38 @@ def cap_per_domain_and_type(
max_docs_per_domain: int,
max_docs_per_type: int,
) -> List[FilteredDocument]:
"""Limit how many docs from a single domain or file type survive.

Dashboard-bypassed docs (selection_reasons contains
``"dashboard_lookup_bypass"``) are always kept and do not consume a
slot against either cap. Curated dashboard injections are a separate
channel from organic search results; without this carve-out, a
dashboard sitting at synthetic priority 1.0 displaces a genuine
organic candidate on the same domain - which is exactly what
happened on q7 (WHO sitreps dashboard squeezed out the WHO research
event page that the baseline extracted records from).
"""
kept: list[FilteredDocument] = []
domain_counts = defaultdict(int)
type_counts = defaultdict(int)

for doc in docs:
doc_type = doc.file_type or "unknown"

if domain_counts[doc.domain] >= max_docs_per_domain:
continue
if type_counts[doc_type] >= max_docs_per_type:
continue
is_dashboard_bypass = "dashboard_lookup_bypass" in (
doc.selection_reasons or []
)

if not is_dashboard_bypass:
if domain_counts[doc.domain] >= max_docs_per_domain:
continue
if type_counts[doc_type] >= max_docs_per_type:
continue

kept.append(doc)
domain_counts[doc.domain] += 1
type_counts[doc_type] += 1
if not is_dashboard_bypass:
domain_counts[doc.domain] += 1
type_counts[doc_type] += 1

return kept

Expand Down
14 changes: 14 additions & 0 deletions bioscancast/insight/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
"max_chunks_per_document": 12,
"extraction_max_output_tokens": 4096,
"chunk_workers": 6,
"low_survival_doc_threshold": 5,
"low_survival_top_k": 20,
}


Expand Down Expand Up @@ -43,6 +45,18 @@ class InsightConfig:
Set to 1 for sequential execution (useful for debugging or rate-
limit-sensitive setups)."""

low_survival_doc_threshold: int = 5
"""When the filter passes fewer than this many usable documents to
insight, switch to ``low_survival_top_k`` for both retrieval and the
per-document chunk cap. q7 reached insight with only 2 surviving
documents; in that regime per-doc retrieval depth becomes the
bottleneck on coverage."""

low_survival_top_k: int = 20
"""Retrieval / per-doc cap used when the usable-document count triggers
the ``low_survival_doc_threshold`` rule. Set to ``None`` (or equal to
``retrieval_top_k``) to disable the adaptive lift."""

@classmethod
def from_dict(cls, d: dict) -> InsightConfig:
"""Create an InsightConfig from a dict, ignoring unknown keys."""
Expand Down
18 changes: 18 additions & 0 deletions bioscancast/insight/extraction/chunk_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]:
if unwrap_quote in unwrap_chunk:
return unwrap_quote

# Layer 4: case-insensitive substring. Catches the model lowercasing
# the leading letter of a sentence it quotes from mid-paragraph - a
# very common drift in an otherwise-verbatim quote (q12 live runs:
# "there are now 750 suspected cases..." vs the source's "There are
# now 750..."). Returns the chunk's own casing so the stored quote
# reflects the source. Crucially this does NOT recover content-
# insertion hallucinations: a fabricated continuation still fails the
# substring test regardless of case (verified against the q12
# "...have been reported in Ituri, North Kivu" fabrication, whose real
# source text continues "...and 906 suspected cases").
ci_chunk = norm_chunk.lower()
for candidate in (norm_quote, stripped):
if not candidate:
continue
idx = ci_chunk.find(candidate.lower())
if idx >= 0:
return norm_chunk[idx: idx + len(candidate)]

return None


Expand Down
Loading