algorithmicgovernance · smodee · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,7 @@ build/
 
 # Data / cache
 data/cache/
+data/runs/
 *.sqlite
 
 # Docling eval — keep FINDINGS.md and sources/, ignore generated run artifacts

diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py
@@ -1,34 +1,112 @@
 """Known biosecurity dashboard URLs by pathogen.
 
-v1 — flagged for iteration after first benchmark run.
-The dashboard list and routing logic will need updating as new outbreaks emerge
-and data portals change.
+v1 — flagged for iteration after first benchmark run. The dashboard list
+and routing logic will need updating as new outbreaks emerge and data
+portals change.
+
+Each entry carries a pathogen-specific ``title`` and ``snippet`` so that
+the heuristic filter and the LLM-rescue path have real signal to work
+with. The earlier convention ("Dashboard: cdc.gov" with a generic
+snippet) produced keyword_overlap_score = 0.000 across the board — see
+issue #14 and the q7/q12 live-run findings.
 """
 
-DASHBOARD_LOOKUP: dict[str, list[str]] = {
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class DashboardEntry:
+    """A curated authoritative source for a pathogen.
+
+    The title and snippet are intended to be readable as a search result
+    in their own right: pathogen name, the kind of data the page hosts,
+    and the publisher. They feed both the keyword-overlap heuristic and
+    the LLM-rescue path.
+    """
+
+    url: str
+    title: str
+    snippet: str
+
+
+DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = {
     "h5n1": [
-        "https://www.cdc.gov/bird-flu/situation-summary/",
-        "https://www.who.int/teams/global-influenza-programme/avian-influenza",
+        DashboardEntry(
+            url="https://www.cdc.gov/bird-flu/situation-summary/",
+            title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States",
+            snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.",
+        ),
+        DashboardEntry(
+            url="https://www.who.int/teams/global-influenza-programme/avian-influenza",
+            title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance",
+            snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.",
+        ),
     ],
     "avian influenza": [
-        "https://www.cdc.gov/bird-flu/situation-summary/",
-        "https://www.who.int/teams/global-influenza-programme/avian-influenza",
+        DashboardEntry(
+            url="https://www.cdc.gov/bird-flu/situation-summary/",
+            title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States",
+            snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.",
+        ),
+        DashboardEntry(
+            url="https://www.who.int/teams/global-influenza-programme/avian-influenza",
+            title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance",
+            snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.",
+        ),
     ],
     "mpox": [
-        "https://ourworldindata.org/mpox",
-        "https://www.who.int/emergencies/situation-reports",
-        "https://www.cdc.gov/mpox/data-research/index.html",
+        DashboardEntry(
+            url="https://ourworldindata.org/mpox",
+            title="Our World in Data mpox tracker: global confirmed cases and deaths",
+            snippet="OWID dashboard tracking cumulative confirmed mpox cases and deaths globally, broken down by country and region, updated from national health agencies.",
+        ),
+        DashboardEntry(
+            url="https://www.who.int/emergencies/situation-reports",
+            title="WHO situation reports including the multi-country mpox outbreak",
+            snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.",
+        ),
+        DashboardEntry(
+            url="https://www.cdc.gov/monkeypox/situation-summary/index.html",
+            title="CDC mpox current situation summary: confirmed cases in the United States",
+            snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.",
+        ),
     ],
     "ebola": [
-        "https://www.afro.who.int/health-topics/ebola-virus-disease",
-        "https://www.cdc.gov/ebola/index.html",
+        DashboardEntry(
+            url="https://www.afro.who.int/health-topics/ebola-disease",
+            title="WHO Africa Ebola virus disease outbreak surveillance and case counts",
+            snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.",
+        ),
+        DashboardEntry(
+            url="https://www.cdc.gov/ebola/about/index.html",
+            title="CDC Ebola virus disease outbreak history and case counts",
+            snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.",
+        ),
     ],
     "covid-19": [
-        "https://ourworldindata.org/coronavirus",
-        "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports",
+        DashboardEntry(
+            url="https://ourworldindata.org/coronavirus",
+            title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations",
+            snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.",
+        ),
+        DashboardEntry(
+            url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports",
+            title="WHO COVID-19 situation reports and global case counts",
+            snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.",
+        ),
     ],
     "marburg": [
-        "https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease",
-        "https://www.cdc.gov/marburg/index.html",
+        DashboardEntry(
+            url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease",
+            title="WHO Marburg virus disease facts and outbreak case counts",
+            snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.",
+        ),
+        DashboardEntry(
+            url="https://www.cdc.gov/marburg/index.html",
+            title="CDC Marburg virus disease outbreaks and surveillance",
+            snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.",
+        ),
     ],
 }
diff --git a/bioscancast/datasets/source_tiers.py b/bioscancast/datasets/source_tiers.py
@@ -59,6 +59,36 @@
     "wikipedia.org",
     "sciencedirect.com",
     "pubmed.ncbi.nlm.nih.gov",
+    # National/international news with established newsrooms. Added after the
+    # #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md):
+    # live pools showed reputable outbreak reporting from these outlets
+    # resolving to "unknown" (domain_score 0.2), which sank them below the
+    # filter's credibility floor. Second-level-domain matching in
+    # resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com,
+    # africa.businessinsider.com, etc.).
+    "cnn.com",
+    "nbcnews.com",
+    "cbsnews.com",
+    "abcnews.go.com",
+    "abcnews.com",
+    "npr.org",
+    "pbs.org",
+    "usatoday.com",
+    "latimes.com",
+    "politico.com",
+    "politico.eu",
+    "axios.com",
+    "thehill.com",
+    "forbes.com",
+    "bloomberg.com",
+    "ft.com",
+    "wsj.com",
+    "economist.com",
+    "time.com",
+    "theatlantic.com",
+    "newyorker.com",
+    "arstechnica.com",
+    "businessinsider.com",
 }
 
 TIER_4_DOMAINS: set[str] = {

diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py
@@ -38,7 +38,14 @@
         "domain": 0.20,
         "official_bonus": 0.20,
     },
-    "heuristic_keep_threshold": 0.72,
+    # Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter
+    # survival of 4.7% / 13.5% — the threshold was tighter than the
+    # heuristic's actual signal supports. Borderline candidates that
+    # cross the new threshold still go to the LLM rescue path; this
+    # change just stops dropping high-credibility-but-low-keyword-overlap
+    # results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7).
+    # See issue #13.
+    "heuristic_keep_threshold": 0.65,
     "heuristic_borderline_threshold": 0.45,
 
     "reranker_weights": {
@@ -49,6 +56,18 @@
     "auto_reject_after_rerank": 0.30,
     "max_llm_filter_candidates": 10,
 
+    # When no LLM client is configured, the ambiguous "llm_needed" band
+    # (reranked priority between auto_reject and auto_keep) is normally
+    # rejected outright (fail-closed). With this flag enabled — for dev /
+    # offline / no-API-key runs — a borderline candidate is instead KEPT if it
+    # is an official domain OR its keyword-overlap relevance clears
+    # ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue
+    # path without an API call, recovering the on-topic / authoritative tail
+    # without admitting the generic-news mass. Default OFF so production (which
+    # always has an LLM client) is unchanged. See issue #13.
+    "no_llm_soft_fallback": False,
+    "no_llm_fallback_relevance_threshold": 0.5,
+
     "max_docs_per_domain": 2,
     "max_docs_per_type": 5,
 

diff --git a/bioscancast/filtering/heuristics.py b/bioscancast/filtering/heuristics.py
@@ -121,6 +121,27 @@ def heuristic_filter(
             )
             continue
 
+        # Dashboard-injected results are hand-curated in
+        # ``bioscancast/datasets/biosecurity_sources.py``; they bypass the
+        # keyword-overlap-driven heuristic which structurally undervalues
+        # their generic titles. See issue #14 and live-run data on q7/q12
+        # where 4/4 injected dashboards had keyword_overlap == 0.000.
+        if result.retrieval_reason == "dashboard_lookup":
+            relevance_score = compute_heuristic_relevance(result, question)
+            credibility_score = compute_heuristic_credibility(result)
+            keep_list.append(
+                make_decision(
+                    result=result,
+                    keep=True,
+                    stage="heuristic",
+                    relevance_score=relevance_score,
+                    credibility_score=credibility_score,
+                    priority_score=1.0,
+                    reason_codes=["dashboard_lookup_bypass"],
+                )
+            )
+            continue
+
         relevance_score = compute_heuristic_relevance(result, question)
         credibility_score = compute_heuristic_credibility(result)
         priority_score = compute_priority_score(result, relevance_score, credibility_score)

diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py
@@ -37,11 +37,24 @@ def run(
         llm_decisions: list[FilterDecision] = []
         if llm_needed:
             if self.llm_client is None:
-                # Fail closed: reject ambiguous cases if no LLM client is configured.
+                # No LLM client. Default is fail-closed (reject the ambiguous
+                # band). When the soft-fallback flag is enabled, keep candidates
+                # that are official-domain or sufficiently relevant — see
+                # FILTER_CONFIG["no_llm_soft_fallback"] and issue #13.
+                soft = FILTER_CONFIG.get("no_llm_soft_fallback", False)
+                rel_threshold = FILTER_CONFIG.get(
+                    "no_llm_fallback_relevance_threshold", 0.5
+                )
                 for d in llm_needed:
-                    d.keep = False
                     d.stage = "llm_skipped"
-                    d.reason_codes.append("no_llm_client_configured")
+                    result = result_map.get(d.result_id)
+                    is_official = bool(result and result.is_official_domain)
+                    if soft and (is_official or d.relevance_score >= rel_threshold):
+                        d.keep = True
+                        d.reason_codes.append("no_llm_soft_fallback_kept")
+                    else:
+                        d.keep = False
+                        d.reason_codes.append("no_llm_client_configured")
                 llm_decisions = llm_needed
             else:
                 llm_decisions = llm_filter_candidates(

diff --git a/bioscancast/filtering/postprocess.py b/bioscancast/filtering/postprocess.py
@@ -49,21 +49,38 @@ def cap_per_domain_and_type(
     max_docs_per_domain: int,
     max_docs_per_type: int,
 ) -> List[FilteredDocument]:
+    """Limit how many docs from a single domain or file type survive.
+
+    Dashboard-bypassed docs (selection_reasons contains
+    ``"dashboard_lookup_bypass"``) are always kept and do not consume a
+    slot against either cap. Curated dashboard injections are a separate
+    channel from organic search results; without this carve-out, a
+    dashboard sitting at synthetic priority 1.0 displaces a genuine
+    organic candidate on the same domain - which is exactly what
+    happened on q7 (WHO sitreps dashboard squeezed out the WHO research
+    event page that the baseline extracted records from).
+    """
     kept: list[FilteredDocument] = []
     domain_counts = defaultdict(int)
     type_counts = defaultdict(int)
 
     for doc in docs:
         doc_type = doc.file_type or "unknown"
 
-        if domain_counts[doc.domain] >= max_docs_per_domain:
-            continue
-        if type_counts[doc_type] >= max_docs_per_type:
-            continue
+        is_dashboard_bypass = "dashboard_lookup_bypass" in (
+            doc.selection_reasons or []
+        )
+
+        if not is_dashboard_bypass:
+            if domain_counts[doc.domain] >= max_docs_per_domain:
+                continue
+            if type_counts[doc_type] >= max_docs_per_type:
+                continue
 
         kept.append(doc)
-        domain_counts[doc.domain] += 1
-        type_counts[doc_type] += 1
+        if not is_dashboard_bypass:
+            domain_counts[doc.domain] += 1
+            type_counts[doc_type] += 1
 
     return kept
 

diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py
@@ -16,6 +16,8 @@
     "max_chunks_per_document": 12,
     "extraction_max_output_tokens": 4096,
     "chunk_workers": 6,
+    "low_survival_doc_threshold": 5,
+    "low_survival_top_k": 20,
 }
 
 
@@ -43,6 +45,18 @@ class InsightConfig:
     Set to 1 for sequential execution (useful for debugging or rate-
     limit-sensitive setups)."""
 
+    low_survival_doc_threshold: int = 5
+    """When the filter passes fewer than this many usable documents to
+    insight, switch to ``low_survival_top_k`` for both retrieval and the
+    per-document chunk cap. q7 reached insight with only 2 surviving
+    documents; in that regime per-doc retrieval depth becomes the
+    bottleneck on coverage."""
+
+    low_survival_top_k: int = 20
+    """Retrieval / per-doc cap used in the low-survival regime (see
+    ``low_survival_doc_threshold``). Set to ``None`` (or equal to
+    ``retrieval_top_k``) to disable the adaptive lift."""
+
     @classmethod
     def from_dict(cls, d: dict) -> InsightConfig:
         """Create an InsightConfig from a dict, ignoring unknown keys."""

diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py
@@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]:
     if unwrap_quote in unwrap_chunk:
         return unwrap_quote
 
+    # Layer 4: case-insensitive substring. Catches the model lowercasing
+    # the first letter of a sentence quoted from mid-paragraph, a common
+    # and otherwise-verbatim drift (q12 live runs: "there are now 750
+    # suspected cases..." vs the source's "There are now 750..."). Returns
+    # the chunk's own casing so the stored quote reflects the source. This
+    # does NOT recover content-insertion hallucinations: a fabricated
+    # continuation still fails the substring test regardless of case
+    # (verified against the q12 "...have been reported in Ituri, North
+    # Kivu" fabrication; the real source continues "...and 906 suspected
+    # cases"). NOTE(review): the slice assumes lower() preserves length.
+    ci_chunk = norm_chunk.lower()
+    for candidate in (norm_quote, stripped):
+        if not candidate:
+            continue
+        idx = ci_chunk.find(candidate.lower())
+        if idx >= 0:
+            return norm_chunk[idx: idx + len(candidate)]
+
     return None