From 128463b098b147af3e02c96848c3c61b032aff66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:10:56 +0200 Subject: [PATCH 01/21] Fix Excel-serial parsing in load_questions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bioscancast_questions.csv stores created_date as an Excel serial day number (e.g. 45712). pd.to_datetime without unit=D + origin=1899-12-30 treated those integers as nanoseconds past 1970, yielding garbage dates like 1970-01-01 00:00:00.000045712. The bug was latent — no caller had yet relied on the parsed date — but the new orchestrator's build_forecast_question factory needs an accurate created_at. After the fix, q7 resolves to 2025-02-24 as expected. --- bioscancast/stages/eval_stage/loaders.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bioscancast/stages/eval_stage/loaders.py b/bioscancast/stages/eval_stage/loaders.py index cf7dbb5..92da2ea 100644 --- a/bioscancast/stages/eval_stage/loaders.py +++ b/bioscancast/stages/eval_stage/loaders.py @@ -66,7 +66,13 @@ def load_questions(path: PathLike) -> pd.DataFrame: df = _clean_text_columns(df) if "created_date" in df.columns: - df["created_date"] = pd.to_datetime(df["created_date"], errors="coerce") + # The CSV stores created_date as an Excel serial day (e.g. 45712 → + # 2025-02-19). Without unit="D" + origin="1899-12-30", pandas treats + # the integer as nanoseconds past 1970 and produces garbage dates + # like 1970-01-01 00:00:00.000045712. + df["created_date"] = pd.to_datetime( + df["created_date"], unit="D", origin="1899-12-30", errors="coerce", + ) if "question_status" in df.columns: df["question_status"] = df["question_status"].str.lower() From 94b315abf57b77ab195405dddaee6d456a2214cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:14:40 +0200 Subject: [PATCH 02/21] Add build_forecast_question factory and load_question_by_id The orchestrator (next commit) needs to turn a CSV row into a typed ForecastQuestion. Maps: - created_date -> tz-aware UTC datetime (already parsed by load_questions) - topic "Pathogen (Region)" -> lowercased pathogen + region - question_text "by Month day, year" -> target_date via regex; falls back to "by Month year" giving the first of next month - question_type + keyword hints in text -> event_type (case_count / death_count / outbreak_declared / None) - resolution_criteria passes through - as_of_date is a factory kwarg, not a CSV column; orchestrator passes it from --as-of-date Tested against all 11 rows of bioscancast_questions.csv; q7 produces ForecastQuestion(id=q7, pathogen=mpox, region=world, target_date=2025-02-28, event_type=case_count, ...). --- bioscancast/stages/eval_stage/loaders.py | 174 ++++++++++++++++++++++- 1 file changed, 172 insertions(+), 2 deletions(-) diff --git a/bioscancast/stages/eval_stage/loaders.py b/bioscancast/stages/eval_stage/loaders.py index 92da2ea..ee1f5bb 100644 --- a/bioscancast/stages/eval_stage/loaders.py +++ b/bioscancast/stages/eval_stage/loaders.py @@ -1,13 +1,107 @@ from __future__ import annotations +import logging +import re +from datetime import datetime, timezone from pathlib import Path -from typing import Union +from typing import Optional, Union import pandas as pd +from bioscancast.filtering.models import ForecastQuestion + PathLike = Union[str, Path] +logger = logging.getLogger(__name__) + + +_MONTH_NUMBERS: dict[str, int] = { + "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, + "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, + "december": 12, +} + +# "by February 28, 2025" / "by Feb 28th 2025" / "by February 2025" +_TARGET_DATE_RE = re.compile( + r"by\s+(?P" + + "|".join(_MONTH_NUMBERS.keys()) + + r"|" + + "|".join(m[:3] for m in _MONTH_NUMBERS.keys()) + + r")" + + r"(?:\s+(?P\d{1,2})(?:st|nd|rd|th)?)?" + + r"(?:,)?\s+(?P\d{4})", + re.IGNORECASE, +) + + +def _parse_target_date(text: str) -> Optional[datetime]: + """Extract a target_date from a question's natural-language text. + + Matches phrasings like "by February 28, 2025", "by Feb 28 2025", and the + month-only fallback "by February 2025" (resolves to the 1st of the + following month, conservatively interpreting "by" as inclusive of the + named month). Returns None if no clear pattern matches. + """ + m = _TARGET_DATE_RE.search(text) + if not m: + return None + month_name = m.group("month").lower() + if len(month_name) == 3: + # Map 3-letter abbreviation back to canonical month + for full, num in _MONTH_NUMBERS.items(): + if full.startswith(month_name): + month = num + break + else: + return None + else: + month = _MONTH_NUMBERS[month_name] + year = int(m.group("year")) + day_str = m.group("day") + if day_str: + day = int(day_str) + try: + return datetime(year, month, day, tzinfo=timezone.utc) + except ValueError: + return None + # Month-only fallback: anchor to the last day of the month would require + # a calendar lookup; the simpler "1st of next month" gives the same + # cutoff semantics for a "by " question. + next_month = month + 1 + next_year = year + if next_month == 13: + next_month = 1 + next_year += 1 + return datetime(next_year, next_month, 1, tzinfo=timezone.utc) + + +def _split_topic(topic: str) -> tuple[Optional[str], Optional[str]]: + """Split a topic like "Mpox (World)" → ("mpox", "world"). + + Returns (pathogen, region). If no parenthetical exists, the whole topic + becomes the pathogen and region is None. + """ + if not topic or pd.isna(topic): + return None, None + topic = topic.strip() + if "(" in topic and topic.endswith(")"): + head, _, tail = topic.rpartition(" (") + return head.strip().lower() or None, tail.rstrip(")").strip().lower() or None + return topic.lower() or None, None + + +def _infer_event_type(question_type: str, question_text: str) -> Optional[str]: + """Map the CSV's question_type plus keyword hints to an event_type.""" + text = (question_text or "").lower() + if "deaths" in text or "death" in text: + return "death_count" + if "cases" in text or "case " in text: + return "case_count" + if "outbreak" in text: + return "outbreak_declared" + return None + def _read_csv(path: PathLike) -> pd.DataFrame: """ @@ -111,4 +205,80 @@ def load_forecasts(path: PathLike) -> pd.DataFrame: if "question_id" in df.columns: df["question_id"] = df["question_id"].str.strip() - return df \ No newline at end of file + return df + + +def build_forecast_question( + row: pd.Series, + *, + as_of_date: Optional[datetime] = None, +) -> ForecastQuestion: + """Convert one row of the question CSV into a ForecastQuestion. + + Used by the orchestrator (`bioscancast.main`) to turn a CSV row into the + typed object the search/filter/insight stages expect. ``as_of_date`` is + passed through verbatim and is the historical-replay cutoff; ``None`` + means live mode. + """ + qid = str(row["question_id"]).strip() + text = str(row["question_text"]).strip() + + created_value = row.get("created_date") + if isinstance(created_value, pd.Timestamp) and not pd.isna(created_value): + created_at = created_value.to_pydatetime() + if created_at.tzinfo is None: + created_at = created_at.replace(tzinfo=timezone.utc) + elif isinstance(created_value, datetime): + created_at = ( + created_value if created_value.tzinfo + else created_value.replace(tzinfo=timezone.utc) + ) + else: + logger.warning( + "Question %s has unparseable created_date %r; defaulting to now()", + qid, created_value, + ) + created_at = datetime.now(timezone.utc) + + topic = row.get("topic", "") + pathogen, region = _split_topic(str(topic) if not pd.isna(topic) else "") + + target_date = _parse_target_date(text) + event_type = _infer_event_type(str(row.get("question_type", "")), text) + + resolution_criteria_val = row.get("resolution_criteria") + resolution_criteria = ( + str(resolution_criteria_val).strip() + if resolution_criteria_val is not None and not pd.isna(resolution_criteria_val) + else None + ) + + return ForecastQuestion( + id=qid, + text=text, + created_at=created_at, + target_date=target_date, + region=region, + pathogen=pathogen, + event_type=event_type, + resolution_criteria=resolution_criteria, + as_of_date=as_of_date, + ) + + +def load_question_by_id( + path: PathLike, + question_id: str, + *, + as_of_date: Optional[datetime] = None, +) -> ForecastQuestion: + """Load a single question from the CSV by its question_id.""" + df = load_questions(path) + matches = df[df["question_id"].astype(str).str.strip() == question_id.strip()] + if matches.empty: + available = sorted(df["question_id"].astype(str).str.strip().tolist()) + raise KeyError( + f"question_id {question_id!r} not found in {path}. " + f"Available: {available}" + ) + return build_forecast_question(matches.iloc[0], as_of_date=as_of_date) \ No newline at end of file From 7b8493679427be18a026a07f6ed7dfb58db1a042 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:15:52 +0200 Subject: [PATCH 03/21] Add orchestration/test_questions.csv with q7 + q12 Branch-local question fixture for the new end-to-end orchestrator's live smoke tests. Two rows: - q7: verbatim copy of the row in bioscancast/stages/eval_stage/ bioscancast_questions.csv. Resolved at 126,441 mpox cases globally by Feb 28 2025. Run with --as-of-date 2025-02-28 to exercise historical replay. - q12: new live question on the current East Africa Ebola outbreak, target_date 2026-06-30. Run with no --as-of-date for live mode. Kept separate from bioscancast_questions.csv so the canonical CSV stays an unmodified record of what human forecasters actually evaluated. --- bioscancast/orchestration/test_questions.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 bioscancast/orchestration/test_questions.csv diff --git a/bioscancast/orchestration/test_questions.csv b/bioscancast/orchestration/test_questions.csv new file mode 100644 index 0000000..84e3ff5 --- /dev/null +++ b/bioscancast/orchestration/test_questions.csv @@ -0,0 +1,3 @@ +question_id;topic;question_text;question_type;resolution_criteria;created_date;question_status;resolved_option;comparison_to_outcome;takeaways;relevant_links +q7;Mpox (World);How many confirmed cumulative Mpox cases will be reported globally by February 28, 2025?;range;The question resolves based on cumulative confirmed Mpox cases.;45712;resolved;126,001-128,500;The final case count was 126,441.;Community adjustment over time demonstrated responsiveness to new data.;https://ourworldindata.org/mpox +q12;Ebola (East Africa);How many cumulative confirmed Ebola cases will be reported in the current East Africa outbreak by June 30, 2026?;range;The number of cumulative confirmed cases reported by Africa CDC and/or the WHO Disease Outbreak News for the current East Africa Ebola outbreak as of June 30, 2026.;46169;unresolved;TBD;TBD;TBD;https://africacdc.org/ From 015cf1f683287cb3aef3e8d71d9a29df51203229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:17:18 +0200 Subject: [PATCH 04/21] Add OpenAI price table and cost estimator bioscancast/llm/pricing.py introduces: - MODEL_PRICES: USD/1M-token snapshot dated 2026-05-27 for the models actually used by stage configs (gpt-4o-mini, gpt-4o, text-embedding-3- small/large) plus a date-pinned gpt-4o-2024-08-06 alias. - estimate_cost(model, input_tokens, output_tokens, cached_input_tokens): computes USD spend with a 50% discount on cached prefix per OpenAI's standard prompt-cache pricing. - estimate_cost_from_summary(): consumes the dict shape that InsightRunResult.budget_summary already produces. Sources cited in the module docstring. Unknown model raises UnknownModelError so the orchestrator surfaces stale price tables loudly rather than under-reporting cost. --- bioscancast/llm/pricing.py | 110 +++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 bioscancast/llm/pricing.py diff --git a/bioscancast/llm/pricing.py b/bioscancast/llm/pricing.py new file mode 100644 index 0000000..d0ae65a --- /dev/null +++ b/bioscancast/llm/pricing.py @@ -0,0 +1,110 @@ +"""USD price table for OpenAI models and a per-call cost estimator. + +The orchestrator uses this to surface estimated cost per pipeline run. +Prices are a point-in-time snapshot — refresh when OpenAI changes rates or +when new model identifiers enter the stage configs. + +Snapshot taken 2026-05-27 from OpenAI's public API pricing pages +(https://devtk.ai/en/models/gpt-4o-mini/ and +https://www.cloudzero.com/blog/openai-pricing/). All numbers are USD per +1,000,000 tokens. The cached-input rate is OpenAI's standard 50% discount +on the cached prefix of an input; embedding models have no separate cached +rate. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ModelPrice: + """USD per 1,000,000 tokens for one model.""" + + input: float + cached_input: float + output: float + + +MODEL_PRICES: dict[str, ModelPrice] = { + # Cheap chat workhorse — used by search (query decomposition + filter + # rescue) and insight (chunk extraction). + "gpt-4o-mini": ModelPrice(input=0.15, cached_input=0.075, output=0.60), + # Strong model — scaffolded for issue #26 refinement but not in + # production use as of 2026-05-27. + "gpt-4o": ModelPrice(input=2.50, cached_input=1.25, output=10.00), + "gpt-4o-2024-08-06": ModelPrice(input=2.50, cached_input=1.25, output=10.00), + # Embeddings (insight retrieval). + "text-embedding-3-small": ModelPrice(input=0.02, cached_input=0.02, output=0.0), + "text-embedding-3-large": ModelPrice(input=0.13, cached_input=0.13, output=0.0), +} + + +class UnknownModelError(KeyError): + """Raised when a model name is not in MODEL_PRICES.""" + + +def estimate_cost( + model: str, + input_tokens: int, + output_tokens: int, + cached_input_tokens: int = 0, +) -> float: + """Estimate USD cost of an LLM call. + + Args: + model: Identifier matching a key in MODEL_PRICES. + input_tokens: Total input tokens (including any cached portion). + output_tokens: Output tokens generated. + cached_input_tokens: Subset of input_tokens that hit the prompt + cache; must be <= input_tokens. The non-cached remainder is + billed at the full input rate. + + Raises: + UnknownModelError: If ``model`` is not in MODEL_PRICES — refresh + this module when adding a new model to a stage config. + ValueError: If cached_input_tokens > input_tokens or any token + count is negative. + """ + if input_tokens < 0 or output_tokens < 0 or cached_input_tokens < 0: + raise ValueError("Token counts must be non-negative") + if cached_input_tokens > input_tokens: + raise ValueError( + f"cached_input_tokens ({cached_input_tokens}) exceeds " + f"input_tokens ({input_tokens})" + ) + try: + price = MODEL_PRICES[model] + except KeyError as exc: + raise UnknownModelError( + f"Model {model!r} is not in MODEL_PRICES; refresh " + f"bioscancast/llm/pricing.py with current rates." + ) from exc + + fresh_input = input_tokens - cached_input_tokens + return ( + fresh_input * price.input + + cached_input_tokens * price.cached_input + + output_tokens * price.output + ) / 1_000_000.0 + + +def estimate_cost_from_summary(summary: dict) -> float: + """Estimate USD cost from an InsightRunResult.budget_summary-style dict. + + Expects a dict with a ``per_model`` key whose value is + ``{model: {input_tokens, output_tokens, [cached_input_tokens]}}`` — + the shape the insight pipeline already produces. Unknown models are + skipped with a noisy KeyError so callers can decide whether to + suppress or surface them. + """ + per_model = summary.get("per_model") or {} + total = 0.0 + for model, counts in per_model.items(): + total += estimate_cost( + model, + input_tokens=int(counts.get("input_tokens", 0)), + output_tokens=int(counts.get("output_tokens", 0)), + cached_input_tokens=int(counts.get("cached_input_tokens", 0)), + ) + return total From ce8c718e16dd13f9db0d4c1059631ce6c9b84ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:18:28 +0200 Subject: [PATCH 05/21] Add bioscancast.orchestration package and persistence helpers New module with the run-directory layout (data/runs/{question_id}/{run_id}/) and per-stage JSON dump helpers: save_question / save_search / save_filtered / save_documents / save_insight / save_manifest. _json_default and the asdict pattern are lifted from scripts/eval_insight_on_real_docs.py so the orchestrator and the eval harness share serialization conventions. --- bioscancast/orchestration/__init__.py | 7 ++ bioscancast/orchestration/persistence.py | 83 ++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 bioscancast/orchestration/__init__.py create mode 100644 bioscancast/orchestration/persistence.py diff --git a/bioscancast/orchestration/__init__.py b/bioscancast/orchestration/__init__.py new file mode 100644 index 0000000..23cc5aa --- /dev/null +++ b/bioscancast/orchestration/__init__.py @@ -0,0 +1,7 @@ +"""End-to-end pipeline orchestration helpers. + +The main orchestrator lives in :mod:`bioscancast.main`. This package +holds the persistence layer (run-directory layout, per-stage JSON dumps, +manifest write/append) so :mod:`bioscancast.main` stays focused on +stage composition. +""" diff --git a/bioscancast/orchestration/persistence.py b/bioscancast/orchestration/persistence.py new file mode 100644 index 0000000..f0d8f3a --- /dev/null +++ b/bioscancast/orchestration/persistence.py @@ -0,0 +1,83 @@ +"""Run-directory layout and per-stage JSON dump helpers. + +The orchestrator writes each stage's output and a running manifest to +``data/runs/{question_id}/{run_id}/`` so a crashed or interrupted run +still has partial artifacts for debugging. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict, is_dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Iterable + +from bioscancast.filtering.models import ForecastQuestion +from bioscancast.insight.pipeline import InsightRunResult + + +def _json_default(obj: Any) -> Any: + """Default JSON encoder for datetimes and dataclasses. + + Lifted from scripts/eval_insight_on_real_docs.py so the orchestrator + and the eval harness use the same conventions for run artifacts. + """ + if isinstance(obj, datetime): + return obj.isoformat() + if is_dataclass(obj): + return asdict(obj) + raise TypeError(f"not serializable: {type(obj).__name__}") + + +def _dump(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, default=_json_default) + + +def make_run_dir(out_root: Path, question_id: str, run_id: str) -> Path: + """Create and return ``out_root/question_id/run_id/``.""" + run_dir = Path(out_root) / question_id / run_id + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + +def save_question(run_dir: Path, question: ForecastQuestion) -> None: + _dump(run_dir / "question.json", asdict(question)) + + +def save_search(run_dir: Path, results: Iterable[Any]) -> None: + _dump(run_dir / "search.json", [asdict(r) for r in results]) + + +def save_filtered(run_dir: Path, docs: Iterable[Any]) -> None: + _dump(run_dir / "filtered.json", [asdict(d) for d in docs]) + + +def save_documents(run_dir: Path, docs: Iterable[Any]) -> None: + _dump(run_dir / "documents.json", [asdict(d) for d in docs]) + + +def save_insight(run_dir: Path, result: InsightRunResult) -> None: + """Serialize an InsightRunResult to insight.json. + + The dataclass nests InsightRecord and ChunkReference dataclasses; + asdict handles both. budget_summary is already a dict. + """ + payload = { + "records": [asdict(r) for r in result.records], + "budget_summary": result.budget_summary, + "documents_processed": result.documents_processed, + "documents_skipped": result.documents_skipped, + "notes": list(result.notes), + } + _dump(run_dir / "insight.json", payload) + + +def save_manifest(run_dir: Path, manifest: dict) -> None: + """Write the manifest. Called repeatedly — once before each stage and + again after every stage completes — so a crashed run keeps the + partial timings/config that did make it through. + """ + _dump(run_dir / "manifest.json", manifest) From c71dafdbbae30fede719a694ee66450299f64d2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:21:22 +0200 Subject: [PATCH 06/21] Implement end-to-end pipeline orchestrator in bioscancast/main.py Replaces the 14-line commented sketch with a real argparse-driven orchestrator that chains all four stages for a single ForecastQuestion: python -m bioscancast.main q7 --as-of-date 2025-02-28 -v Pipeline: 1. load_question_by_id reads the CSV row and builds a ForecastQuestion via the new factory (applying any CLI overrides). 2. SearchStagePipeline runs with a usage-tracking LLM wrapper so search + filter token usage is accumulated for cost reporting. 3. FilteringPipeline reuses the same wrapped client. 4. ExtractionPipeline gets as_of_date=question.as_of_date so the fetcher uses Wayback snapshots in historical-replay mode. 5. InsightPipeline receives the raw (unwrapped) client; its BudgetTracker already tracks usage, so wrapping would double-count. After all stages, search/filter usage and insight per_model usage are merged and fed to bioscancast.llm.pricing.estimate_cost for a single USD figure in the final epilogue and manifest. Persistence: data/runs/{question_id}/{run_id}/ question.json, search.json, filtered.json, documents.json, insight.json, manifest.json The manifest is rewritten after every stage so a crashed run keeps partial timings + config. On any stage exception the manifest pins the failing stage and re-raises wrapped in PipelineError; main() exits 1. Empty intermediate output is not an error - logged and passed through (the insight stage already handles zero documents). --- bioscancast/main.py | 424 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 413 insertions(+), 11 deletions(-) diff --git a/bioscancast/main.py b/bioscancast/main.py index 6dd54a9..68e5c80 100644 --- a/bioscancast/main.py +++ b/bioscancast/main.py @@ -1,14 +1,416 @@ -# question = ForecastQuestion( -# id="Q123", -# text="Will country X report more than 50 confirmed human cases of pathogen Y by 30 June 2026?", -# created_at=now() -# ) +"""End-to-end pipeline orchestrator: search -> filter -> extract -> insight. -# search_results = run_search_stage(question, config) +Runs all four stages against a single forecast question, persisting each +stage's output and a running manifest under ``data/runs/{qid}/{run_id}/`` +so a crashed run still has partial artifacts for debugging. -# filtered_docs = run_filtering_stage(question, search_results, config) +Usage: -# downstream -# extracted_texts = run_extraction_stage(filtered_docs, config) -# insights_df = run_insight_stage(question, extracted_texts, config) -# forecast = run_forecasting_stage(question, insights_df, config) \ No newline at end of file + python -m bioscancast.main QUESTION_ID [--csv PATH] [--as-of-date Y-M-D] ... + +See ``--help`` for the full flag list. +""" + +from __future__ import annotations + +import argparse +import logging +import os +import sys +import time +from collections import defaultdict +from contextlib import contextmanager +from dataclasses import asdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Iterator, Optional + +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass + +from bioscancast.extraction.config import ExtractionConfig +from bioscancast.extraction.pipeline import ExtractionPipeline +from bioscancast.filtering.config import FILTER_CONFIG +from bioscancast.filtering.models import ForecastQuestion +from bioscancast.filtering.pipeline import FilteringPipeline +from bioscancast.insight.config import InsightConfig +from bioscancast.insight.pipeline import InsightPipeline, InsightRunResult +from bioscancast.llm.base import LLMResponse +from bioscancast.llm.openai_client import OpenAILLMClient +from bioscancast.llm.pricing import ( + UnknownModelError, + estimate_cost, +) +from bioscancast.orchestration import persistence +from bioscancast.stages.eval_stage.loaders import load_question_by_id +from bioscancast.stages.search_stage.backends.tavily_backend import TavilyBackend +from bioscancast.stages.search_stage.cache import SearchCache +from bioscancast.stages.search_stage.pipeline import SearchStagePipeline + + +DEFAULT_CSV = "bioscancast/stages/eval_stage/bioscancast_questions.csv" +DEFAULT_OUT_ROOT = "data/runs" + +logger = logging.getLogger("bioscancast.main") + + +class PipelineError(RuntimeError): + """Raised when a stage fails; wraps the underlying exception with stage name.""" + + def __init__(self, stage: str, original: BaseException) -> None: + super().__init__(f"Pipeline failed in stage {stage!r}: {original}") + self.stage = stage + self.original = original + + +class _UsageTrackingClient: + """Wraps an LLMClient and accumulates per-model token usage. + + Used for the search and filter stages so the orchestrator can include + their cost in the final estimate. The insight stage already tracks its + own usage via BudgetTracker — passing the raw client to insight avoids + double-counting. + """ + + def __init__(self, inner) -> None: + self._inner = inner + self.per_model: dict[str, dict[str, int]] = defaultdict( + lambda: {"input_tokens": 0, "output_tokens": 0, "calls": 0} + ) + + def generate_json(self, **kwargs) -> LLMResponse: + response = self._inner.generate_json(**kwargs) + bucket = self.per_model[response.model] + bucket["input_tokens"] += response.input_tokens + bucket["output_tokens"] += response.output_tokens + bucket["calls"] += 1 + return response + + def embed(self, texts: list[str], *, model: str) -> list[list[float]]: + # The OpenAI embed() call doesn't expose usage today; the insight + # pipeline doesn't currently track embedding cost either. If + # embeddings become a material cost line, hook tokenizer-based + # estimation in here. + return self._inner.embed(texts, model=model) + + +# ---------------------------------------------------------------------------- +# CLI parsing and question construction +# ---------------------------------------------------------------------------- + + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="bioscancast.main", + description="Run the BioScanCast pipeline end-to-end for one question.", + ) + parser.add_argument( + "question_id", + help="ID of the question in the CSV (e.g. q7).", + ) + parser.add_argument( + "--csv", + default=DEFAULT_CSV, + help=f"Path to the question CSV. Default: {DEFAULT_CSV}", + ) + parser.add_argument( + "--out-root", + default=DEFAULT_OUT_ROOT, + help=f"Run-artifact root directory. Default: {DEFAULT_OUT_ROOT}", + ) + parser.add_argument( + "--run-id", + default=None, + help="Override the UTC-timestamp run directory name.", + ) + parser.add_argument( + "--as-of-date", + default=None, + help="Historical-replay cutoff (YYYY-MM-DD). Omit for live mode.", + ) + parser.add_argument( + "--target-date", + default=None, + help="Override CSV-derived target_date (YYYY-MM-DD).", + ) + parser.add_argument("--region", default=None, help="Override region field.") + parser.add_argument("--pathogen", default=None, help="Override pathogen field.") + parser.add_argument( + "--event-type", default=None, help="Override event_type field." + ) + parser.add_argument( + "--no-cache", + action="store_true", + help="Disable the search-stage cache.", + ) + parser.add_argument( + "--max-input-tokens", + type=int, + default=None, + help="Override InsightConfig.max_input_tokens_per_run.", + ) + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Set log level to INFO.", + ) + return parser.parse_args(argv) + + +def _parse_date(arg: Optional[str]) -> Optional[datetime]: + if arg is None: + return None + return datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=timezone.utc) + + +def _apply_overrides(q: ForecastQuestion, args: argparse.Namespace) -> ForecastQuestion: + target_date = _parse_date(args.target_date) + return ForecastQuestion( + id=q.id, + text=q.text, + created_at=q.created_at, + target_date=target_date if target_date is not None else q.target_date, + region=args.region or q.region, + pathogen=args.pathogen or q.pathogen, + event_type=args.event_type or q.event_type, + resolution_criteria=q.resolution_criteria, + as_of_date=q.as_of_date, + ) + + +# ---------------------------------------------------------------------------- +# Stage timing and console output +# ---------------------------------------------------------------------------- + + +@contextmanager +def _stage_timer(manifest: dict, stage: str) -> Iterator[None]: + manifest["stage_timings"].setdefault(stage, None) + manifest["current_stage"] = stage + print(f"[{stage}] starting...", flush=True) + t0 = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - t0 + manifest["stage_timings"][stage] = round(elapsed, 3) + + +def _log_summary(stage: str, summary: str, elapsed: Optional[float]) -> None: + tail = f"({elapsed:.1f}s)" if elapsed is not None else "" + print(f"[{stage}] {summary} {tail}", flush=True) + + +# ---------------------------------------------------------------------------- +# Cost estimation +# ---------------------------------------------------------------------------- + + +def _merge_usage(*usage_dicts: dict[str, dict[str, int]]) -> dict[str, dict[str, int]]: + """Combine per-model usage dicts from different stages.""" + merged: dict[str, dict[str, int]] = defaultdict( + lambda: {"input_tokens": 0, "output_tokens": 0, "calls": 0} + ) + for usage in usage_dicts: + for model, counts in (usage or {}).items(): + for k in ("input_tokens", "output_tokens", "calls"): + merged[model][k] += int(counts.get(k, 0)) + return dict(merged) + + +def _estimate_total_cost(per_model: dict[str, dict[str, int]]) -> tuple[float, list[str]]: + """Return (usd_total, list_of_warnings).""" + total = 0.0 + warnings: list[str] = [] + for model, counts in per_model.items(): + try: + total += estimate_cost( + model, + input_tokens=int(counts.get("input_tokens", 0)), + output_tokens=int(counts.get("output_tokens", 0)), + ) + except UnknownModelError as exc: + warnings.append(str(exc)) + return total, warnings + + +# ---------------------------------------------------------------------------- +# Main pipeline +# ---------------------------------------------------------------------------- + + +def run_pipeline(args: argparse.Namespace) -> InsightRunResult: + csv_path = Path(args.csv) + if not csv_path.exists(): + raise FileNotFoundError(f"Question CSV not found: {csv_path}") + + for var in ("OPENAI_API_KEY", "TAVILY_API_KEY"): + if not os.environ.get(var): + raise RuntimeError( + f"Missing required environment variable {var}. " + f"Set it in your shell or in a .env file." + ) + + question = load_question_by_id( + csv_path, + args.question_id, + as_of_date=_parse_date(args.as_of_date), + ) + question = _apply_overrides(question, args) + + run_id = args.run_id or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + out_root = Path(args.out_root) + run_dir = persistence.make_run_dir(out_root, question.id, run_id) + + print(f"=== Pipeline run: {question.id} ({run_id}) ===") + print( + f"question: {question.text}\n" + f"pathogen: {question.pathogen!r}\n" + f"region: {question.region!r}\n" + f"target: {question.target_date.date() if question.target_date else None}\n" + f"as_of: {question.as_of_date.date() if question.as_of_date else None}\n" + f"artifacts: {run_dir}\n", + ) + + persistence.save_question(run_dir, question) + + insight_config = InsightConfig() + if args.max_input_tokens: + insight_config.max_input_tokens_per_run = args.max_input_tokens + + manifest: dict[str, Any] = { + "run_id": run_id, + "question_id": question.id, + "csv_path": str(csv_path), + "started_at": datetime.now(timezone.utc).isoformat(), + "completed_at": None, + "stage_timings": {}, + "current_stage": None, + "errored_stage": None, + "error_message": None, + "config": { + "filter": dict(FILTER_CONFIG), + "extraction": asdict(ExtractionConfig()), + "insight": asdict(insight_config), + }, + } + persistence.save_manifest(run_dir, manifest) + + shared_llm_raw = OpenAILLMClient() + shared_llm = _UsageTrackingClient(shared_llm_raw) + + try: + with _stage_timer(manifest, "search"): + backend = TavilyBackend() + cache = None if args.no_cache else SearchCache() + search_pipeline = SearchStagePipeline( + search_backend=backend, + llm_client=shared_llm, + cache=cache, + backend_name="tavily", + ) + search_results = search_pipeline.run(question) + persistence.save_search(run_dir, search_results) + if cache: + cache.close() + _log_summary( + "search", f"{len(search_results)} results", + manifest["stage_timings"]["search"], + ) + persistence.save_manifest(run_dir, manifest) + + with _stage_timer(manifest, "filter"): + filter_pipeline = FilteringPipeline(llm_client=shared_llm) + filtered_docs = filter_pipeline.run(question, search_results) + persistence.save_filtered(run_dir, filtered_docs) + _log_summary( + "filter", f"{len(filtered_docs)} docs", + manifest["stage_timings"]["filter"], + ) + persistence.save_manifest(run_dir, manifest) + + with _stage_timer(manifest, "extract"): + extraction_pipeline = ExtractionPipeline(as_of_date=question.as_of_date) + documents = extraction_pipeline.run(filtered_docs) + persistence.save_documents(run_dir, documents) + n_ok = sum(1 for d in documents if d.status == "success") + _log_summary( + "extract", f"{n_ok}/{len(documents)} success", + manifest["stage_timings"]["extract"], + ) + persistence.save_manifest(run_dir, manifest) + + with _stage_timer(manifest, "insight"): + insight_pipeline = InsightPipeline( + llm_client=shared_llm_raw, + config=insight_config, + ) + insight_result = insight_pipeline.run(question, documents) + persistence.save_insight(run_dir, insight_result) + budget = insight_result.budget_summary + _log_summary( + "insight", + f"{len(insight_result.records)} records | " + f"in={budget.get('total_input_tokens', 0):,} " + f"out={budget.get('total_output_tokens', 0):,}", + manifest["stage_timings"]["insight"], + ) + + except Exception as exc: + manifest["errored_stage"] = manifest.get("current_stage") + manifest["error_message"] = f"{type(exc).__name__}: {exc}" + persistence.save_manifest(run_dir, manifest) + raise PipelineError(manifest["current_stage"] or "unknown", exc) from exc + + # Combine usage across stages and estimate cost. + combined_usage = _merge_usage( + dict(shared_llm.per_model), + insight_result.budget_summary.get("per_model") or {}, + ) + cost_usd, cost_warnings = _estimate_total_cost(combined_usage) + + manifest["current_stage"] = None + manifest["completed_at"] = datetime.now(timezone.utc).isoformat() + manifest["combined_usage"] = combined_usage + manifest["estimated_cost_usd"] = round(cost_usd, 6) + if cost_warnings: + manifest["cost_estimate_warnings"] = cost_warnings + persistence.save_manifest(run_dir, manifest) + + print() + print(f"=== Pipeline complete: {question.id} ===") + for stage in ("search", "filter", "extract", "insight"): + elapsed = manifest["stage_timings"].get(stage) + if elapsed is not None: + print(f" {stage:<8} {elapsed:>7.2f}s") + print(f" estimated cost: ${cost_usd:.4f}") + if cost_warnings: + for w in cost_warnings: + print(f" ! {w}") + print(f" artifacts: {run_dir}") + + return insight_result + + +def main(argv: Optional[list[str]] = None) -> int: + args = _parse_args(argv if argv is not None else sys.argv[1:]) + logging.basicConfig( + level=logging.INFO if args.verbose else logging.WARNING, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + try: + run_pipeline(args) + except PipelineError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + except (FileNotFoundError, KeyError, RuntimeError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7c8c9d83d1c20a03bde2c5d5831b8681f14b31ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:22:27 +0200 Subject: [PATCH 07/21] Add scripts/run_pipeline.py wrapper and gitignore data/runs/ Thin scripts/run_pipeline.py wrapper around bioscancast.main:main so the new orchestrator follows the same `scripts/run_*.py` convention as the existing per-stage runners. Both invocations are equivalent: python -m bioscancast.main q7 python scripts/run_pipeline.py q7 data/runs/ added to .gitignore so per-run artifacts (some quite large - documents.json includes every chunk text) don't pollute the repo. --- .gitignore | 1 + scripts/run_pipeline.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 scripts/run_pipeline.py diff --git a/.gitignore b/.gitignore index ccb93a7..43b0233 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ build/ # Data / cache data/cache/ +data/runs/ *.sqlite # Docling eval — keep FINDINGS.md and sources/, ignore generated run artifacts diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py new file mode 100644 index 0000000..d76fecd --- /dev/null +++ b/scripts/run_pipeline.py @@ -0,0 +1,21 @@ +"""Thin wrapper around ``python -m bioscancast.main`` so the orchestrator +matches the existing per-stage runner convention (scripts/run_*.py). + +Usage: + python scripts/run_pipeline.py q7 --as-of-date 2025-02-28 -v +""" + +from __future__ import annotations + +import os +import sys + +# Add project root to path so `bioscancast` imports work when run from +# anywhere. +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from bioscancast.main import main # noqa: E402 + + +if __name__ == "__main__": + sys.exit(main()) From 86dc646a7c7dedc77b79d4d7e35e1ca7dfdff709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 14:33:22 +0200 Subject: [PATCH 08/21] Handle sets and dated OpenAI model aliases (surfaced by live runs) Two small fixes uncovered by the first live runs of the orchestrator: 1. persistence._json_default crashed on FILTER_CONFIG's set values (blocked_domains, low_value_url_keywords, etc.) when serializing the manifest. Now sorts sets to lists. 2. pricing.MODEL_PRICES needs the dated aliases OpenAI returns in response.model. A request to "gpt-4o-mini" comes back tagged "gpt-4o-mini-2024-07-18", which was missing from the table and produced a $0 cost estimate with a noisy warning. Added that alias plus a couple of known gpt-4o dated variants. q7 historical-replay run subsequently cost $0.0030, q12 live run cost $0.0049, both correctly reported in the manifest. --- bioscancast/llm/pricing.py | 15 +++++++++++---- bioscancast/orchestration/persistence.py | 8 +++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bioscancast/llm/pricing.py b/bioscancast/llm/pricing.py index d0ae65a..56ef62a 100644 --- a/bioscancast/llm/pricing.py +++ b/bioscancast/llm/pricing.py @@ -26,14 +26,21 @@ class ModelPrice: output: float +_GPT_4O_MINI = ModelPrice(input=0.15, cached_input=0.075, output=0.60) +_GPT_4O = ModelPrice(input=2.50, cached_input=1.25, output=10.00) + MODEL_PRICES: dict[str, ModelPrice] = { # Cheap chat workhorse — used by search (query decomposition + filter - # rescue) and insight (chunk extraction). - "gpt-4o-mini": ModelPrice(input=0.15, cached_input=0.075, output=0.60), + # rescue) and insight (chunk extraction). OpenAI returns the dated + # alias in response.model even when the request used the floating + # name; keep both keyed to the same price. + "gpt-4o-mini": _GPT_4O_MINI, + "gpt-4o-mini-2024-07-18": _GPT_4O_MINI, # Strong model — scaffolded for issue #26 refinement but not in # production use as of 2026-05-27. - "gpt-4o": ModelPrice(input=2.50, cached_input=1.25, output=10.00), - "gpt-4o-2024-08-06": ModelPrice(input=2.50, cached_input=1.25, output=10.00), + "gpt-4o": _GPT_4O, + "gpt-4o-2024-08-06": _GPT_4O, + "gpt-4o-2024-05-13": _GPT_4O, # Embeddings (insight retrieval). "text-embedding-3-small": ModelPrice(input=0.02, cached_input=0.02, output=0.0), "text-embedding-3-large": ModelPrice(input=0.13, cached_input=0.13, output=0.0), diff --git a/bioscancast/orchestration/persistence.py b/bioscancast/orchestration/persistence.py index f0d8f3a..6118e7a 100644 --- a/bioscancast/orchestration/persistence.py +++ b/bioscancast/orchestration/persistence.py @@ -18,15 +18,17 @@ def _json_default(obj: Any) -> Any: - """Default JSON encoder for datetimes and dataclasses. + """Default JSON encoder for datetimes, dataclasses, and sets. - Lifted from scripts/eval_insight_on_real_docs.py so the orchestrator - and the eval harness use the same conventions for run artifacts. + Lifted from scripts/eval_insight_on_real_docs.py and extended to + handle the set/frozenset values that live in FILTER_CONFIG. """ if isinstance(obj, datetime): return obj.isoformat() if is_dataclass(obj): return asdict(obj) + if isinstance(obj, (set, frozenset)): + return sorted(obj) raise TypeError(f"not serializable: {type(obj).__name__}") From 2a03ea8e3f5ef4b2e0eb73c683ecad1ffad8ae03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:28:46 +0200 Subject: [PATCH 09/21] Bypass heuristic scoring for dashboard-injected results (#14) Dashboard URLs from the curated registry in bioscancast/datasets/biosecurity_sources.py have hand-picked titles ("Dashboard: cdc.gov") and generic snippets that produce keyword_overlap_score = 0.000 against any real forecast question. The heuristic priority score drags them under the 0.72 keep threshold even though they are by construction high-value sources. Live-run evidence: q7 and q12 each injected two dashboards. All four had keyword_overlap = 0.000. Two of those four were dropped pre-LLM, including ourworldindata.org for q7 - which is the resolution source named in the question's relevant_links column. Fix: in heuristic_filter, detect retrieval_reason == "dashboard_lookup" and auto-keep with reason_code "dashboard_lookup_bypass" and a synthetic priority_score of 1.0. The dashboards still go through the rest of the filtering pipeline (dedup, per-domain cap, extraction-hint assignment) unchanged - this is the keyword-overlap chokepoint only. Implements item 1 from the Tier 1 roadmap. Pairs with the dashboard title/snippet enrichment in the next commit. --- bioscancast/filtering/heuristics.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bioscancast/filtering/heuristics.py b/bioscancast/filtering/heuristics.py index 8aa67bc..fe41dc5 100644 --- a/bioscancast/filtering/heuristics.py +++ b/bioscancast/filtering/heuristics.py @@ -121,6 +121,27 @@ def heuristic_filter( ) continue + # Dashboard-injected results are hand-curated in + # ``bioscancast/datasets/biosecurity_sources.py``; they bypass the + # keyword-overlap-driven heuristic which structurally undervalues + # their generic titles. See issue #14 and live-run data on q7/q12 + # where 4/4 injected dashboards had keyword_overlap == 0.000. + if result.retrieval_reason == "dashboard_lookup": + relevance_score = compute_heuristic_relevance(result, question) + credibility_score = compute_heuristic_credibility(result) + keep_list.append( + make_decision( + result=result, + keep=True, + stage="heuristic", + relevance_score=relevance_score, + credibility_score=credibility_score, + priority_score=1.0, + reason_codes=["dashboard_lookup_bypass"], + ) + ) + continue + relevance_score = compute_heuristic_relevance(result, question) credibility_score = compute_heuristic_credibility(result) priority_score = compute_priority_score(result, relevance_score, credibility_score) From 13db598efda7ac7d3e66a781d2ed210222bf5ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:30:54 +0200 Subject: [PATCH 10/21] Enrich dashboard titles and snippets with pathogen-specific text (#14) The previous dashboard injection used generic strings ("Dashboard: cdc.gov", "Known mpox monitoring dashboard") that produced keyword_overlap_score = 0.000 against every real forecast question - 4/4 injected dashboards in the q7/q12 live runs had this exact failure mode. The fix: turn DASHBOARD_LOOKUP into a list of DashboardEntry dataclasses carrying url + title + snippet, with hand-written pathogen-specific text for each entry. The titles read as real search-result titles ("CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States") and the snippets describe what data the page hosts. Pairs with the previous commit's dashboard heuristic bypass: even with the bypass in place, better titles still help (a) the keyword-overlap score for downstream scoring, and (b) the LLM rescue path when it encounters other pathogen-specific dashboards we add later. The bypass keeps low-keyword-overlap dashboards alive; this commit makes them discoverable on their own merits. Implements item 5 from the Tier 1/2 roadmap. --- bioscancast/datasets/biosecurity_sources.py | 112 +++++++++++++++--- .../stages/search_stage/dashboard_lookup.py | 20 ++-- 2 files changed, 105 insertions(+), 27 deletions(-) diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py index ddfd11c..f4b9878 100644 --- a/bioscancast/datasets/biosecurity_sources.py +++ b/bioscancast/datasets/biosecurity_sources.py @@ -1,34 +1,112 @@ """Known biosecurity dashboard URLs by pathogen. -v1 — flagged for iteration after first benchmark run. -The dashboard list and routing logic will need updating as new outbreaks emerge -and data portals change. +v1 — flagged for iteration after first benchmark run. The dashboard list +and routing logic will need updating as new outbreaks emerge and data +portals change. + +Each entry carries a pathogen-specific ``title`` and ``snippet`` so that +the heuristic filter and the LLM-rescue path have real signal to work +with. The earlier convention ("Dashboard: cdc.gov" with a generic +snippet) produced keyword_overlap_score = 0.000 across the board — see +issue #14 and the q7/q12 live-run findings. """ -DASHBOARD_LOOKUP: dict[str, list[str]] = { +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class DashboardEntry: + """A curated authoritative source for a pathogen. + + The title and snippet are intended to be readable as a search result + in their own right: pathogen name, the kind of data the page hosts, + and the publisher. They feed both the keyword-overlap heuristic and + the LLM-rescue path. + """ + + url: str + title: str + snippet: str + + +DASHBOARD_LOOKUP: dict[str, list[DashboardEntry]] = { "h5n1": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "avian influenza": [ - "https://www.cdc.gov/bird-flu/situation-summary/", - "https://www.who.int/teams/global-influenza-programme/avian-influenza", + DashboardEntry( + url="https://www.cdc.gov/bird-flu/situation-summary/", + title="CDC H5N1 bird flu situation summary: human cases and outbreaks in the United States", + snippet="CDC tracking of H5N1 avian influenza human cases, affected livestock herds, and public-health response in the US.", + ), + DashboardEntry( + url="https://www.who.int/teams/global-influenza-programme/avian-influenza", + title="WHO Global Influenza Programme: avian influenza A(H5N1) human cases and surveillance", + snippet="WHO monitoring of human H5N1 cases, animal-to-human spillover events, and global surveillance reporting.", + ), ], "mpox": [ - "https://ourworldindata.org/mpox", - "https://www.who.int/emergencies/situation-reports", - "https://www.cdc.gov/mpox/data-research/index.html", + DashboardEntry( + url="https://ourworldindata.org/mpox", + title="Our World in Data mpox tracker: global confirmed cases and deaths", + snippet="OWID dashboard tracking cumulative confirmed mpox cases and deaths globally, broken down by country and region, updated from national health agencies.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/situation-reports", + title="WHO situation reports including the multi-country mpox outbreak", + snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.", + ), + DashboardEntry( + url="https://www.cdc.gov/mpox/data-research/index.html", + title="CDC mpox data and research dashboard for the United States", + snippet="CDC tracking of US mpox cases, demographic data, vaccination coverage, and outbreak response.", + ), ], "ebola": [ - "https://www.afro.who.int/health-topics/ebola-virus-disease", - "https://www.cdc.gov/ebola/index.html", + DashboardEntry( + url="https://www.afro.who.int/health-topics/ebola-virus-disease", + title="WHO Africa Ebola virus disease outbreak surveillance and case counts", + snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.", + ), + DashboardEntry( + url="https://www.cdc.gov/ebola/index.html", + title="CDC Ebola virus disease outbreak history and case counts", + snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.", + ), ], "covid-19": [ - "https://ourworldindata.org/coronavirus", - "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + DashboardEntry( + url="https://ourworldindata.org/coronavirus", + title="Our World in Data COVID-19 tracker: global cases, deaths, and vaccinations", + snippet="OWID dashboard tracking cumulative COVID-19 confirmed cases, deaths, hospitalizations, and vaccination coverage globally by country.", + ), + DashboardEntry( + url="https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports", + title="WHO COVID-19 situation reports and global case counts", + snippet="WHO situation reports with updates on COVID-19 confirmed cases, deaths, variant tracking, and country-level data.", + ), ], "marburg": [ - "https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", - "https://www.cdc.gov/marburg/index.html", + DashboardEntry( + url="https://www.who.int/news-room/fact-sheets/detail/marburg-virus-disease", + title="WHO Marburg virus disease facts and outbreak case counts", + snippet="WHO factsheet on Marburg virus disease including transmission, symptoms, case-fatality ratio, and historical outbreak case and death counts.", + ), + DashboardEntry( + url="https://www.cdc.gov/marburg/index.html", + title="CDC Marburg virus disease outbreaks and surveillance", + snippet="CDC information on Marburg virus disease outbreaks worldwide, case counts, deaths, and US public-health surveillance.", + ), ], } diff --git a/bioscancast/stages/search_stage/dashboard_lookup.py b/bioscancast/stages/search_stage/dashboard_lookup.py index e3784c3..87fb9a5 100644 --- a/bioscancast/stages/search_stage/dashboard_lookup.py +++ b/bioscancast/stages/search_stage/dashboard_lookup.py @@ -48,21 +48,21 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: return [] pathogen_key = question.pathogen.strip().lower() - urls = DASHBOARD_LOOKUP.get(pathogen_key, []) - if not urls: + entries = DASHBOARD_LOOKUP.get(pathogen_key, []) + if not entries: return [] as_of = question.as_of_date results: list[SearchResult] = [] now = datetime.now(timezone.utc) - for url in urls: + for entry in entries: if as_of is not None: - snapshot = closest_snapshot_before(url, as_of) + snapshot = closest_snapshot_before(entry.url, as_of) if snapshot is None: logger.info( "Suppressing dashboard %s — no Wayback snapshot at-or-before %s", - url, as_of.isoformat(), + entry.url, as_of.isoformat(), ) continue snapshot_dt, snapshot_url = snapshot @@ -71,12 +71,12 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: published_date_source = "wayback_snapshot" # Keep ``domain`` as the original publisher for tier scoring; # the URL itself points at archive.org for fetching. - domain = extract_domain(url) + domain = extract_domain(entry.url) else: - effective_url = url + effective_url = entry.url published_date = None published_date_source = None - domain = extract_domain(url) + domain = extract_domain(entry.url) tier_num, domain_score, source_tier = resolve_tier(domain) @@ -89,8 +89,8 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: url=effective_url, canonical_url=normalize_url(effective_url), domain=domain, - title=f"Dashboard: {domain}", - snippet=f"Known {pathogen_key} monitoring dashboard", + title=entry.title, + snippet=entry.snippet, rank=0, retrieved_at=now, published_date=published_date, From 1a4816a395389067b3a2689ebab57010518666aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:32:05 +0200 Subject: [PATCH 11/21] Lower heuristic_keep_threshold from 0.72 to 0.65 (#13) Live runs on q7 and q12 showed filter survival of 4.7% and 13.5% respectively, even with LLM rescue enabled. The 0.72 threshold was set without benchmarking against real Tavily output and is too tight for the heuristic's actual signal. With the new threshold, priority_scores in the 0.65-0.72 band are auto-kept by heuristics instead of routed to the LLM rescue path. The borderline threshold (0.45) is unchanged, so the LLM filter still gates 0.45-0.65 candidates - the change just moves the auto-keep line to better match what the heuristic can actually distinguish. Implements item 2 from the Tier 1 roadmap. Pairs with the dashboard bypass + enrichment commits to attack the filter chokepoint from multiple angles. --- bioscancast/filtering/config.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py index f0477aa..4bf99c2 100644 --- a/bioscancast/filtering/config.py +++ b/bioscancast/filtering/config.py @@ -38,7 +38,14 @@ "domain": 0.20, "official_bonus": 0.20, }, - "heuristic_keep_threshold": 0.72, + # Lowered from 0.72 to 0.65 after q7/q12 live runs showed filter + # survival of 4.7% / 13.5% — the threshold was tighter than the + # heuristic's actual signal supports. Borderline candidates that + # cross the new threshold still go to the LLM rescue path; this + # change just stops dropping high-credibility-but-low-keyword-overlap + # results pre-LLM (e.g. apnews/theguardian/washingtonpost in q7). + # See issue #13. + "heuristic_keep_threshold": 0.65, "heuristic_borderline_threshold": 0.45, "reranker_weights": { From c81f0070efc572cd7c6359d6af23eb28674aea33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:33:49 +0200 Subject: [PATCH 12/21] Add suspected_deaths to controlled metric_name vocabulary q12 Record 3 reported metric_name=deaths, metric_value=160 from the source quote "160 suspected deaths out of 670 suspected cases". The prompt's canonical vocabulary already had `suspected_cases` (for the 670) but no `suspected_deaths` slot - so the model collapsed the "suspected" qualifier and emitted plain `deaths`. The result was an arithmetically scandalous record (160 deaths against 61 confirmed cases) that wouldn't survive any reasonable post-hoc sanity check. Changes: - Add `suspected_deaths` alongside the existing `suspected_cases` entry. Two-tier system per category (confirmed_* and suspected_*), matching the agreed shape of the vocab. - Drop the now-redundant standalone `probable_cases` line; the `suspected_cases` description explicitly covers "suspected", "probable", and "possible" as the same tier. WHO's combined `confirmed_or_probable_cases` bucket is kept separately because it is a distinct reporting category. - Add a deaths-family mapping rule paralleling the existing cases-family rule ("suspected deaths", "probable deaths", "deaths under investigation" all map to suspected_deaths). Mirror what the existing `confirmed_cases` line does for the cases family. - Clean up the stale "possible all get their own variants below" parenthetical which referenced a `possible_cases` slot that has never existed. After this, q12's "160 suspected deaths" should extract as suspected_deaths=160 rather than deaths=160 - same value, correct category, no longer competing with confirmed_cases for downstream forecasting weight. Implements item 3 from the Tier 1 roadmap. --- bioscancast/insight/extraction/prompts.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index 0f74b4c..ab603b6 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -40,12 +40,13 @@ 6. For metric_name, use one of these canonical snake_case values when \ applicable (this lets downstream dedup merge facts about the same \ metric across sources): - - confirmed_cases (suspected, probable, possible all get \ -their own variants below) - - suspected_cases - - probable_cases - - confirmed_or_probable_cases - - deaths + - confirmed_cases (the "confirmed" tier — lab-confirmed) + - suspected_cases (the "not-yet-confirmed" tier — covers \ +"suspected", "probable", and "possible" reporting categories) + - confirmed_or_probable_cases (WHO/CDC's combined reporting bucket) + - deaths (lab-confirmed deaths) + - suspected_deaths (the "not-yet-confirmed" tier for deaths — \ +covers "suspected", "probable", "under investigation" reporting) - hospitalizations - recoveries - vaccinations_administered @@ -58,7 +59,10 @@ If none of these fit, invent a short snake_case label. Do NOT put \ qualifiers (sex, age, sub-region, time-period like "weekly") in \ metric_name — capture those in `summary` or `location` instead. \ -"cases", "reported cases", "total cases" all map to confirmed_cases. +"cases", "reported cases", "total cases" all map to confirmed_cases. \ +"suspected cases", "probable cases", "possible cases" all map to \ +suspected_cases. "deaths" alone maps to deaths; "suspected deaths", \ +"probable deaths", "deaths under investigation" map to suspected_deaths. 7. Be aware of cognitive biases that affect information processing: - Anchoring: do not over-weight the first number you encounter. - Availability: rare dramatic events are not necessarily more likely. From 26ae088b555f6887c6ca733ba098ae9e737962c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:34:33 +0200 Subject: [PATCH 13/21] Require the quote field to contain the figure (extraction prompt) q12 Record 4 reported metric_value=82 with the quote "the outbreak now poses a 'very high' risk for Congo - up from a previous categorization of 'high'" - no digit, no number-word, no relative reference. The hallucination guard's verbatim-substring check passed because the quote string did appear in the source chunk, but nothing in the guard required the quote sentence to be the one actually carrying the figure. The metric_value of 82 came from elsewhere in the chunk; the quote was a "supporting context" sentence. A deterministic post-hoc check (str(metric_value) in quote) would over-reject: word numbers ("a dozen"), relative quantities ("a quarter of the population"), and number-word forms ("ninety-nine thousand") would all be false-positive rejections. So the fix lives at the prompt level instead: tell the model the quote MUST be the sentence that carries the figure - digits, number-word, or a clear relative reference. A purely contextual sentence is not acceptable. The verbatim-substring guard remains the safety net. This change tightens the model's understanding of what `quote` is supposed to do without committing to a brittle programmatic check that would lose real signal on legitimate paraphrases. Implements item 4 from the Tier 2 roadmap. --- bioscancast/insight/extraction/prompts.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index ab603b6..a750014 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -27,7 +27,12 @@ by the chunk text. Do NOT infer, speculate, or use outside knowledge. 2. For each fact, provide a verbatim quote from the chunk (max 200 \ characters) that supports the claim. The quote must be an exact \ -substring of the chunk text. +substring of the chunk text. The quote MUST be the sentence (or \ +sentence fragment) that carries the figure itself — it must contain \ +the metric_value either as digits (e.g. "82"), as a number-word \ +(e.g. "eighty-two", "a dozen"), or as a clear relative reference \ +(e.g. "a quarter of the population"). A contextual or supporting \ +sentence that mentions the topic but not the figure is NOT acceptable. 3. If the chunk contains no relevant facts, return an empty facts list. \ This is expected and common — most chunks are irrelevant. 4. Do NOT answer the forecast question. Your job is fact extraction, \ From c584ef66efa8faa3733c4c07a998903737540b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:39:28 +0200 Subject: [PATCH 14/21] Adaptive retrieval_top_k when filter survivors are few When the filtering stage passes only a handful of documents through to insight, per-document retrieval depth becomes the bottleneck on coverage. q7's live run reached insight with 2 usable documents and hit retrieval_top_k=12 on each - meaning at most ~24 chunk extractions for the whole question. Bumping per-doc retrieval depth costs little and gives the model more chances to find the relevant figure in each surviving document. Adds two InsightConfig fields: - low_survival_doc_threshold (default 5) - low_survival_top_k (default 20) When the count of usable documents (status != "failed" and non-empty chunks) is at or below the threshold, both retrieval_top_k and max_chunks_per_document effectively rise to low_survival_top_k for the run, and a note is appended to InsightRunResult.notes flagging that the adaptive path engaged. Default config (12 doc threshold not hit -> normal top_k=12) is unchanged. Tests that pin retrieval_top_k to small values to control fake-LLM call counts now also pin low_survival_top_k to the same value, opting out of the adaptive lift explicitly. 447 tests still passing. Implements item 6 from the Tier 2 roadmap. Completes the planned bundle of items 1+2+3+4+5+6. --- bioscancast/insight/config.py | 14 +++++++++ bioscancast/insight/pipeline.py | 28 +++++++++++++++-- bioscancast/tests/test_insight_pipeline.py | 35 ++++++++++++++++------ 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py index fad6418..6eaf37b 100644 --- a/bioscancast/insight/config.py +++ b/bioscancast/insight/config.py @@ -16,6 +16,8 @@ "max_chunks_per_document": 12, "extraction_max_output_tokens": 4096, "chunk_workers": 6, + "low_survival_doc_threshold": 5, + "low_survival_top_k": 20, } @@ -43,6 +45,18 @@ class InsightConfig: Set to 1 for sequential execution (useful for debugging or rate- limit-sensitive setups).""" + low_survival_doc_threshold: int = 5 + """When the filter passes fewer than this many usable documents to + insight, switch to ``low_survival_top_k`` for both retrieval and the + per-document chunk cap. q7 reached insight with only 2 surviving + documents; in that regime per-doc retrieval depth becomes the + bottleneck on coverage.""" + + low_survival_top_k: int = 20 + """Retrieval / per-doc cap used when usable documents are at or below + ``low_survival_doc_threshold``. Set to ``None`` (or equal to + ``retrieval_top_k``) to disable the adaptive lift.""" + @classmethod def from_dict(cls, d: dict) -> InsightConfig: """Create an InsightConfig from a dict, ignoring unknown keys.""" diff --git a/bioscancast/insight/pipeline.py b/bioscancast/insight/pipeline.py index ea294fd..7d5c5dd 100644 --- a/bioscancast/insight/pipeline.py +++ b/bioscancast/insight/pipeline.py @@ -103,6 +103,30 @@ def run( result = InsightRunResult() embedding_cache: dict[str, list[float]] = {} + # Adaptive top-k: when the filter passes through only a handful + # of usable documents, lift retrieval depth so the per-doc chunk + # budget isn't the bottleneck on coverage. See InsightConfig + # docstrings for the rationale. + usable_doc_count = sum( + 1 for d in documents if d.status != "failed" and d.chunks + ) + if usable_doc_count <= config.low_survival_doc_threshold: + effective_top_k = max(config.retrieval_top_k, config.low_survival_top_k) + effective_max_chunks = max( + config.max_chunks_per_document, config.low_survival_top_k + ) + if effective_top_k != config.retrieval_top_k: + result.notes.append( + f"Low-survival adaptive top_k engaged: " + f"{usable_doc_count} usable docs (≤ threshold " + f"{config.low_survival_doc_threshold}); " + f"retrieval_top_k={effective_top_k} (default " + f"{config.retrieval_top_k})." + ) + else: + effective_top_k = config.retrieval_top_k + effective_max_chunks = config.max_chunks_per_document + for doc in documents: # --- Skip check --- if doc.status == "failed" or not doc.chunks: @@ -126,7 +150,7 @@ def run( question, doc, self._llm, - top_k=config.retrieval_top_k, + top_k=effective_top_k, bm25_weight=config.bm25_weight, embedding_weight=config.embedding_weight, embedding_model=config.embedding_model, @@ -134,7 +158,7 @@ def run( ) # Cap chunks per document - scored_chunks = scored_chunks[: config.max_chunks_per_document] + scored_chunks = scored_chunks[:effective_max_chunks] # --- Per-chunk extraction (parallel within a doc) --- # Live tests on real biosecurity documents show the per-doc diff --git a/bioscancast/tests/test_insight_pipeline.py b/bioscancast/tests/test_insight_pipeline.py index 1d6b9ba..679924a 100644 --- a/bioscancast/tests/test_insight_pipeline.py +++ b/bioscancast/tests/test_insight_pipeline.py @@ -59,7 +59,9 @@ def test_pipeline_single_document(): RISK_ASSESSMENT_RESPONSE, # chunk p4 (no facts) ]) - config = InsightConfig(retrieval_top_k=5, max_chunks_per_document=5) + config = InsightConfig( + retrieval_top_k=5, max_chunks_per_document=5, low_survival_top_k=5, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -91,7 +93,9 @@ def test_pipeline_skips_failed_documents(): EMPTY_RESPONSE, # For the one chunk that gets extracted ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) # Include a failed document alongside a successful one @@ -114,7 +118,9 @@ def test_pipeline_budget_tracking(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -137,6 +143,7 @@ def test_pipeline_stops_on_budget_exceeded(): config = InsightConfig( retrieval_top_k=2, max_chunks_per_document=2, + low_survival_top_k=2, max_input_tokens_per_run=1, # Absurdly low -> triggers immediately ) pipeline = InsightPipeline(llm_client=client, config=config) @@ -170,7 +177,9 @@ def test_pipeline_deduplication(): DUPLICATE_SUDAN_CASE_COUNT, # doc 2 -> 1 fact (duplicate case) ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN, doc2]) @@ -580,6 +589,7 @@ def test_pipeline_parallel_chunk_extraction_produces_all_records(): config = InsightConfig( retrieval_top_k=4, max_chunks_per_document=4, + low_survival_top_k=4, chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, config=config) @@ -603,10 +613,12 @@ def test_pipeline_sequential_and_parallel_produce_same_record_count(): of records when the fake LLM is content-keyed (so result depends on chunk content, not worker order).""" config_seq = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=1, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=1, ) config_par = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) seq_pipeline = InsightPipeline( @@ -653,7 +665,8 @@ def embed(self, texts, *, model): fake = _IntermittentFake() config = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, config=config) # Must not raise — failed chunk is logged and skipped @@ -684,7 +697,9 @@ def test_pipeline_multi_document(): H5N1_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_H5N1, [DOC_WHO_SUDAN, DOC_CDC_H5N1]) @@ -709,7 +724,9 @@ def test_pipeline_output_records_valid(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) From eb0a5f6173bb5854ebfa5c986c7a54fae9008352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:57:01 +0200 Subject: [PATCH 15/21] Exempt dashboard-bypassed docs from cap_per_domain_and_type q7's second live run on this branch surfaced an interaction between the new dashboard heuristic bypass and the per-domain cap. With max_docs_per_domain=2 and the dashboard bypass injecting one who.int slot at synthetic priority 1.0, the cap was effectively reducing who.int to ONE organic slot - and the slot was going to a priority-0.7097 strategic-plan announcement page, squeezing out the priority-0.6966 WHO mpox research event page that the baseline run had extracted records from. Offline filter replay on the saved q7 search.json confirms the mechanism: Heuristic-keep (4 who.int / ourworldindata.org docs): 1.0000 WHO sitreps dashboard (bypass) 1.0000 OWID mpox dashboard (bypass) 0.7097 WHO global strategic preparedness plan (organic) 0.6966 WHO mpox research event (organic) <- baseline's data source After old cap_per_domain (max=2 per domain): Dashboards displace one organic each; research event capped out. The fix: dashboard-bypass docs (selection_reasons contains "dashboard_lookup_bypass") are always kept and do not consume a slot against the per-domain or per-type caps. They are curated additions, not competing organic results. After the change all four candidates survive, and the WHO research event page reaches insight as it did in the baseline. 447 tests still passing. --- bioscancast/filtering/postprocess.py | 29 ++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/bioscancast/filtering/postprocess.py b/bioscancast/filtering/postprocess.py index c0caba5..fd8d07c 100644 --- a/bioscancast/filtering/postprocess.py +++ b/bioscancast/filtering/postprocess.py @@ -49,6 +49,17 @@ def cap_per_domain_and_type( max_docs_per_domain: int, max_docs_per_type: int, ) -> List[FilteredDocument]: + """Limit how many docs from a single domain or file type survive. + + Dashboard-bypassed docs (selection_reasons contains + ``"dashboard_lookup_bypass"``) are always kept and do not consume a + slot against either cap. Curated dashboard injections are a separate + channel from organic search results; without this carve-out, a + dashboard sitting at synthetic priority 1.0 displaces a genuine + organic candidate on the same domain - which is exactly what + happened on q7 (WHO sitreps dashboard squeezed out the WHO research + event page that the baseline extracted records from). + """ kept: list[FilteredDocument] = [] domain_counts = defaultdict(int) type_counts = defaultdict(int) @@ -56,14 +67,20 @@ def cap_per_domain_and_type( for doc in docs: doc_type = doc.file_type or "unknown" - if domain_counts[doc.domain] >= max_docs_per_domain: - continue - if type_counts[doc_type] >= max_docs_per_type: - continue + is_dashboard_bypass = "dashboard_lookup_bypass" in ( + doc.selection_reasons or [] + ) + + if not is_dashboard_bypass: + if domain_counts[doc.domain] >= max_docs_per_domain: + continue + if type_counts[doc_type] >= max_docs_per_type: + continue kept.append(doc) - domain_counts[doc.domain] += 1 - type_counts[doc_type] += 1 + if not is_dashboard_bypass: + domain_counts[doc.domain] += 1 + type_counts[doc_type] += 1 return kept From 75f292500e0089c9d3130e73d7aae4ae961d2451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 08:09:08 +0200 Subject: [PATCH 16/21] Add per-stage cost line-items to orchestrator epilogue The epilogue previously printed a single total-cost figure. This splits cost by stage (search / filter / insight) so a cost spike in any one stage is visible at a glance during iteration. Mechanism: _UsageTrackingClient gains a snapshot() method; run_pipeline snapshots the shared tracker after search and after filter and diffs them (_usage_delta) to attribute usage to each stage. Insight reports its own budget_summary per_model as before. The extract stage makes no LLM calls, so it shows timing only. Manifest gains stage_usage and stage_costs_usd alongside the existing combined_usage and estimated_cost_usd. Epilogue now reads e.g.: search 23.87s $0.0009 filter 3.06s $0.0003 extract 59.69s insight 12.28s $0.0030 total cost: $0.0042 Implements item 11 from the roadmap. 447 tests still passing. --- bioscancast/main.py | 58 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/bioscancast/main.py b/bioscancast/main.py index 68e5c80..8c40c81 100644 --- a/bioscancast/main.py +++ b/bioscancast/main.py @@ -96,6 +96,11 @@ def embed(self, texts: list[str], *, model: str) -> list[list[float]]: # estimation in here. return self._inner.embed(texts, model=model) + def snapshot(self) -> dict[str, dict[str, int]]: + """Return a plain-dict copy of cumulative usage for delta-ing + between stages.""" + return {model: dict(counts) for model, counts in self.per_model.items()} + # ---------------------------------------------------------------------------- # CLI parsing and question construction @@ -221,6 +226,24 @@ def _merge_usage(*usage_dicts: dict[str, dict[str, int]]) -> dict[str, dict[str, return dict(merged) +def _usage_delta( + before: dict[str, dict[str, int]], + after: dict[str, dict[str, int]], +) -> dict[str, dict[str, int]]: + """Per-model usage that accrued between two snapshots of the same + tracker. Models with no change are omitted.""" + delta: dict[str, dict[str, int]] = {} + for model, counts in after.items(): + b = before.get(model, {}) + d = { + k: int(counts.get(k, 0)) - int(b.get(k, 0)) + for k in ("input_tokens", "output_tokens", "calls") + } + if any(d.values()): + delta[model] = d + return delta + + def _estimate_total_cost(per_model: dict[str, dict[str, int]]) -> tuple[float, list[str]]: """Return (usd_total, list_of_warnings).""" total = 0.0 @@ -302,6 +325,11 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: shared_llm_raw = OpenAILLMClient() shared_llm = _UsageTrackingClient(shared_llm_raw) + # Per-stage usage is captured by snapshotting the shared tracker + # before/after each stage that uses it. Search and filter share one + # client; insight reports its own budget_summary. + stage_usage: dict[str, dict[str, dict[str, int]]] = {} + try: with _stage_timer(manifest, "search"): backend = TavilyBackend() @@ -316,6 +344,8 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: persistence.save_search(run_dir, search_results) if cache: cache.close() + usage_after_search = shared_llm.snapshot() + stage_usage["search"] = usage_after_search _log_summary( "search", f"{len(search_results)} results", manifest["stage_timings"]["search"], @@ -326,6 +356,7 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: filter_pipeline = FilteringPipeline(llm_client=shared_llm) filtered_docs = filter_pipeline.run(question, search_results) persistence.save_filtered(run_dir, filtered_docs) + stage_usage["filter"] = _usage_delta(usage_after_search, shared_llm.snapshot()) _log_summary( "filter", f"{len(filtered_docs)} docs", manifest["stage_timings"]["filter"], @@ -350,6 +381,7 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: ) insight_result = insight_pipeline.run(question, documents) persistence.save_insight(run_dir, insight_result) + stage_usage["insight"] = insight_result.budget_summary.get("per_model") or {} budget = insight_result.budget_summary _log_summary( "insight", @@ -365,16 +397,29 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: persistence.save_manifest(run_dir, manifest) raise PipelineError(manifest["current_stage"] or "unknown", exc) from exc - # Combine usage across stages and estimate cost. + # Per-stage cost from the captured usage snapshots. + stage_costs: dict[str, float] = {} + cost_warnings: list[str] = [] + for stage in ("search", "filter", "insight"): + usage = stage_usage.get(stage) or {} + cost, warns = _estimate_total_cost(usage) + stage_costs[stage] = round(cost, 6) + cost_warnings.extend(warns) + + # Combine usage across stages and estimate total cost. combined_usage = _merge_usage( dict(shared_llm.per_model), insight_result.budget_summary.get("per_model") or {}, ) - cost_usd, cost_warnings = _estimate_total_cost(combined_usage) + cost_usd, _ = _estimate_total_cost(combined_usage) + # Dedup warnings (same unknown model can surface in multiple stages). + cost_warnings = list(dict.fromkeys(cost_warnings)) manifest["current_stage"] = None manifest["completed_at"] = datetime.now(timezone.utc).isoformat() manifest["combined_usage"] = combined_usage + manifest["stage_usage"] = stage_usage + manifest["stage_costs_usd"] = stage_costs manifest["estimated_cost_usd"] = round(cost_usd, 6) if cost_warnings: manifest["cost_estimate_warnings"] = cost_warnings @@ -384,9 +429,12 @@ def run_pipeline(args: argparse.Namespace) -> InsightRunResult: print(f"=== Pipeline complete: {question.id} ===") for stage in ("search", "filter", "extract", "insight"): elapsed = manifest["stage_timings"].get(stage) - if elapsed is not None: - print(f" {stage:<8} {elapsed:>7.2f}s") - print(f" estimated cost: ${cost_usd:.4f}") + if elapsed is None: + continue + stage_cost = stage_costs.get(stage) + cost_str = f" ${stage_cost:.4f}" if stage_cost is not None else "" + print(f" {stage:<8} {elapsed:>7.2f}s{cost_str}") + print(f" total cost: ${cost_usd:.4f}") if cost_warnings: for w in cost_warnings: print(f" ! {w}") From 76d5adfd1d9df347debd4b433d77d49917456e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 08:14:05 +0200 Subject: [PATCH 17/21] Add case-insensitive layer 4 to the hallucination guard Item 8 investigation: of the three quotes the guard dropped across the q12 live runs, two were real facts lost purely because the model lowercased the leading letter of a sentence it quoted from mid-paragraph: source: "There are now 750 suspected cases and 177 suspected deaths" model: "there are now 750 suspected cases and 177 suspected deaths" source: "The Congolese Ministry of Communication, in a post to X ... said that there were 904 suspected cases and 119 ..." model: "the Congolese Ministry of Communication, in a post to X ..." The third rejection was a genuine content-insertion hallucination - the model bolted the real prefix "a total of 105 confirmed cases (including 10 deaths)" onto a fabricated continuation "...have been reported in Ituri, North Kivu, and South Kivu" (the source actually continues "...and 906 suspected cases"). Fix: a fourth, case-insensitive substring layer. It returns the chunk's own casing so the stored quote still reflects the source verbatim. The key safety property - verified against the real q12 fabrication and captured in a new regression test - is that case-folding does NOT recover content insertions: a fabricated continuation fails the substring test regardless of case. Tests: new _LAYER4_CASE_INSENSITIVE_CASES (the two recovered q12 quotes) plus a hallucination case mirroring the q12 fabrication that must stay rejected. 450 passed (was 447; +3 guard cases). Implements item 8 from the roadmap. --- .../insight/extraction/chunk_extractor.py | 18 ++++++++++ .../tests/test_insight_chunk_extractor.py | 35 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py index dd24d71..d8a773d 100644 --- a/bioscancast/insight/extraction/chunk_extractor.py +++ b/bioscancast/insight/extraction/chunk_extractor.py @@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]: if unwrap_quote in unwrap_chunk: return unwrap_quote + # Layer 4: case-insensitive substring. Catches the model lowercasing + # the leading letter of a sentence it quotes from mid-paragraph - + # otherwise verbatim drift that's very common (q12 live runs: + # "there are now 750 suspected cases..." vs the source's "There are + # now 750..."). Returns the chunk's own casing so the stored quote + # reflects the source. Crucially this does NOT recover content- + # insertion hallucinations: a fabricated continuation still fails the + # substring test regardless of case (verified against the q12 + # "...have been reported in Ituri, North Kivu" fabrication, whose real + # source text continues "...and 906 suspected cases"). + ci_chunk = norm_chunk.lower() + for candidate in (norm_quote, stripped): + if not candidate: + continue + idx = ci_chunk.find(candidate.lower()) + if idx >= 0: + return norm_chunk[idx: idx + len(candidate)] + return None diff --git a/bioscancast/tests/test_insight_chunk_extractor.py b/bioscancast/tests/test_insight_chunk_extractor.py index 2ba0421..4a112b9 100644 --- a/bioscancast/tests/test_insight_chunk_extractor.py +++ b/bioscancast/tests/test_insight_chunk_extractor.py @@ -367,6 +367,25 @@ def test_response_returned_for_budget_tracking(): ] +_LAYER4_CASE_INSENSITIVE_CASES = [ + ( + # Real q12 finding: model lowercased the leading "T" of a sentence + # it quoted from mid-paragraph; otherwise verbatim. + "leading letter lowercased by model", + "There are now 750 suspected cases and 177 suspected deaths, though more are expected.", + "there are now 750 suspected cases and 177 suspected deaths", + True, + ), + ( + # Real q12 finding: same drift on a longer attribution clause. + "leading 'The' lowercased mid-paragraph quote", + "The Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths.", + "the Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths", + True, + ), +] + + _HALLUCINATION_CASES = [ ( "fabricated word inserted into list", @@ -374,6 +393,17 @@ def test_response_returned_for_budget_tracking(): "Ghana, Atlantis, and Liberia have reported human mpox due to clade IIa MPXV.", False, ), + ( + # Real q12 finding: model bolted a real prefix ("a total of 105 + # confirmed cases (including 10 deaths)") onto a fabricated + # continuation. The source actually continues "...and 906 + # suspected cases". Must stay rejected even with the new + # case-insensitive layer 4. + "real prefix bolted onto fabricated continuation (q12)", + "According to the Ministry of Health of DRC on 25 May, a total of 105 confirmed cases (including 10 deaths) and 906 suspected cases.", + "a total of 105 confirmed cases (including 10 deaths) have been reported in Ituri, North Kivu, and South Kivu", + False, + ), ( "wholesale fabrication", "Some real chunk content about measles cases in Utah.", @@ -410,7 +440,10 @@ def test_response_returned_for_budget_tracking(): @pytest.mark.parametrize( "label,chunk_text,quote,should_match", - _LAYER1_NFKC_CASES + _LAYER2_TERMINAL_PUNCTUATION_CASES + _LAYER3_WRAPPING_PUNCTUATION_CASES, + _LAYER1_NFKC_CASES + + _LAYER2_TERMINAL_PUNCTUATION_CASES + + _LAYER3_WRAPPING_PUNCTUATION_CASES + + _LAYER4_CASE_INSENSITIVE_CASES, ) def test_quote_matches_accepts_real_quotes_with_normalisation_drift( label, chunk_text, quote, should_match From cf4bed2fe6f2b677c567528a74f01ca52dedc989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:02:59 +0200 Subject: [PATCH 18/21] Add a relevance term to the search-stage score search_stage_score was 0.5*domain + 0.3*freshness + 0.2*rank with no topical-relevance signal, so high-authority but off-topic results ranked at the top (e.g. sports/legal/unrelated-pathogen news). It is now 0.45*relevance + 0.30*domain + 0.10*freshness + 0.15*rank, reusing the filter's keyword_overlap_score/build_query_terms. Freshness is kept low because it is near-uniform in live mode. Addresses #4. Co-Authored-By: Claude Opus 4.7 --- bioscancast/stages/search_stage/pipeline.py | 42 +++++++++++++++++++-- bioscancast/tests/test_search_pipeline.py | 7 +++- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/bioscancast/stages/search_stage/pipeline.py b/bioscancast/stages/search_stage/pipeline.py index edc573f..1477634 100644 --- a/bioscancast/stages/search_stage/pipeline.py +++ b/bioscancast/stages/search_stage/pipeline.py @@ -13,7 +13,9 @@ from typing import List, Optional from bioscancast.filtering.config import FILTER_CONFIG +from bioscancast.filtering.heuristics import build_query_terms from bioscancast.filtering.models import ForecastQuestion, SearchResult +from bioscancast.filtering.utils import keyword_overlap_score from bioscancast.llm.base import LLMClient from bioscancast.stages.search_stage.backends.base import RawSearchResult, SearchBackend from bioscancast.stages.search_stage.cache import SearchCache @@ -83,10 +85,39 @@ def _compute_freshness( return max(0.0, min(1.0, 1.0 - (days_old / 365.0))) -def _compute_search_stage_score(domain_score: float, freshness_score: float, rank: int) -> float: - """search_stage_score = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * (1/rank)""" +# search_stage_score weights (sum to 1.0). Relevance (keyword overlap of +# title/snippet/domain against the question terms) is the dominant term: +# domain/freshness/rank alone rank off-topic high-authority content too highly, +# because freshness is ~uniform in live mode and domain score is too coarse to +# separate on-topic from off-topic within a tier. Freshness is kept low for that +# reason. See data/investigations/findings-issues-3-4-13.md (#4). +_SCORE_W_RELEVANCE = 0.45 +_SCORE_W_DOMAIN = 0.30 +_SCORE_W_FRESHNESS = 0.10 +_SCORE_W_RANK = 0.15 + + +def _compute_relevance(result: SearchResult, question: ForecastQuestion) -> float: + """Keyword overlap of the result against the question terms. + + Mirrors ``bioscancast.filtering.heuristics.compute_heuristic_relevance`` so + the search stage and the filter stage use the same relevance signal. + """ + text = f"{result.title} {result.snippet} {result.domain}" + return keyword_overlap_score(text, build_query_terms(question)) + + +def _compute_search_stage_score( + relevance: float, domain_score: float, freshness_score: float, rank: int +) -> float: + """search_stage_score = 0.45*relevance + 0.30*domain + 0.10*freshness + 0.15*(1/rank)""" rank_score = 1.0 / max(rank, 1) - raw = 0.5 * domain_score + 0.3 * freshness_score + 0.2 * rank_score + raw = ( + _SCORE_W_RELEVANCE * relevance + + _SCORE_W_DOMAIN * domain_score + + _SCORE_W_FRESHNESS * freshness_score + + _SCORE_W_RANK * rank_score + ) return max(0.0, min(1.0, raw)) @@ -260,7 +291,10 @@ def run(self, question: ForecastQuestion) -> List[SearchResult]: r.published_date, reference_date=as_of ) r.search_stage_score = _compute_search_stage_score( - r.domain_score, r.freshness_score, r.rank + _compute_relevance(r, question), + r.domain_score, + r.freshness_score, + r.rank, ) # 8. Sort and cap diff --git a/bioscancast/tests/test_search_pipeline.py b/bioscancast/tests/test_search_pipeline.py index 907d6a4..d140fab 100644 --- a/bioscancast/tests/test_search_pipeline.py +++ b/bioscancast/tests/test_search_pipeline.py @@ -160,9 +160,14 @@ def test_total_cap_enforced(self): def test_scoring_formula(self): """Verify the search_stage_score formula for a known result.""" + from bioscancast.stages.search_stage.pipeline import _compute_relevance + + question = _make_question() results = self._run_pipeline() for r in results: - expected = 0.5 * r.domain_score + 0.3 * r.freshness_score + 0.2 * (1.0 / max(r.rank, 1)) + rel = _compute_relevance(r, question) + rank_score = 1.0 / max(r.rank, 1) + expected = 0.45 * rel + 0.30 * r.domain_score + 0.10 * r.freshness_score + 0.15 * rank_score expected = max(0.0, min(1.0, expected)) assert abs(r.search_stage_score - expected) < 1e-9, ( f"Score mismatch for {r.url}: {r.search_stage_score} != {expected}" From 61e7787e5b5fcfaf192c2f064758521c3a3c4879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:12 +0200 Subject: [PATCH 19/21] Fix stale dashboard URLs and make pathogen routing tolerant The CDC mpox dashboard URL returned 404; replaced with the (extractable) monkeypox/situation-summary page. Updated two stale redirects (afro.who.int ebola-disease, cdc.gov/ebola/about). DASHBOARD_LOOKUP routing was an exact lowercase key match, so 'marburg virus disease' failed to route to the 'marburg' key; added _resolve_pathogen_key with alias + substring matching (marburg virus disease->marburg, monkeypox->mpox, bird flu->h5n1). Addresses #3. Co-Authored-By: Claude Opus 4.7 --- bioscancast/datasets/biosecurity_sources.py | 10 ++-- .../stages/search_stage/dashboard_lookup.py | 46 +++++++++++++++++-- bioscancast/tests/test_dashboard_lookup.py | 17 +++++++ 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/bioscancast/datasets/biosecurity_sources.py b/bioscancast/datasets/biosecurity_sources.py index f4b9878..2a47288 100644 --- a/bioscancast/datasets/biosecurity_sources.py +++ b/bioscancast/datasets/biosecurity_sources.py @@ -68,19 +68,19 @@ class DashboardEntry: snippet="WHO situation reports with weekly case counts, country breakdowns, and public-health guidance for ongoing outbreaks including mpox.", ), DashboardEntry( - url="https://www.cdc.gov/mpox/data-research/index.html", - title="CDC mpox data and research dashboard for the United States", - snippet="CDC tracking of US mpox cases, demographic data, vaccination coverage, and outbreak response.", + url="https://www.cdc.gov/monkeypox/situation-summary/index.html", + title="CDC mpox current situation summary: confirmed cases in the United States", + snippet="CDC current situation summary for mpox, with US confirmed case counts, clade information, demographics, and outbreak response.", ), ], "ebola": [ DashboardEntry( - url="https://www.afro.who.int/health-topics/ebola-virus-disease", + url="https://www.afro.who.int/health-topics/ebola-disease", title="WHO Africa Ebola virus disease outbreak surveillance and case counts", snippet="WHO regional office for Africa tracking of Ebola virus disease outbreaks, confirmed and suspected cases, deaths, and response across African countries.", ), DashboardEntry( - url="https://www.cdc.gov/ebola/index.html", + url="https://www.cdc.gov/ebola/about/index.html", title="CDC Ebola virus disease outbreak history and case counts", snippet="CDC information on current and historical Ebola virus disease outbreaks worldwide, with case counts, deaths, and US public-health response.", ), diff --git a/bioscancast/stages/search_stage/dashboard_lookup.py b/bioscancast/stages/search_stage/dashboard_lookup.py index 87fb9a5..2d3048c 100644 --- a/bioscancast/stages/search_stage/dashboard_lookup.py +++ b/bioscancast/stages/search_stage/dashboard_lookup.py @@ -31,6 +31,46 @@ logger = logging.getLogger(__name__) +# Common name variants that should route to a canonical DASHBOARD_LOOKUP key. +# The canonical-key substring fallback in ``_resolve_pathogen_key`` already +# handles suffixes like "marburg virus disease" -> "marburg"; this map covers +# synonyms where the canonical key is NOT a substring of the alias. +_PATHOGEN_ALIASES: dict[str, str] = { + "monkeypox": "mpox", + "sars-cov-2": "covid-19", + "sars-cov2": "covid-19", + "covid": "covid-19", + "covid19": "covid-19", + "coronavirus": "covid-19", + "bird flu": "h5n1", + "avian flu": "h5n1", +} + + +def _resolve_pathogen_key(pathogen: str) -> str | None: + """Map a free-text pathogen string to a DASHBOARD_LOOKUP key, tolerantly. + + Resolution order: exact key, exact alias, alias-substring, then + canonical-key substring (longest match wins, so "ebola virus disease" + resolves to "ebola" and "marburg virus disease" to "marburg"). Returns + None if nothing matches. + """ + key = pathogen.strip().lower() + if not key: + return None + if key in DASHBOARD_LOOKUP: + return key + if key in _PATHOGEN_ALIASES and _PATHOGEN_ALIASES[key] in DASHBOARD_LOOKUP: + return _PATHOGEN_ALIASES[key] + for alias, canon in _PATHOGEN_ALIASES.items(): + if alias in key and canon in DASHBOARD_LOOKUP: + return canon + matches = [k for k in DASHBOARD_LOOKUP if k in key] + if matches: + return max(matches, key=len) + return None + + def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: """Generate synthetic SearchResult entries for known pathogen dashboards. @@ -47,10 +87,10 @@ def lookup_dashboards(question: ForecastQuestion) -> List[SearchResult]: if not question.pathogen: return [] - pathogen_key = question.pathogen.strip().lower() - entries = DASHBOARD_LOOKUP.get(pathogen_key, []) - if not entries: + pathogen_key = _resolve_pathogen_key(question.pathogen) + if not pathogen_key: return [] + entries = DASHBOARD_LOOKUP[pathogen_key] as_of = question.as_of_date results: list[SearchResult] = [] diff --git a/bioscancast/tests/test_dashboard_lookup.py b/bioscancast/tests/test_dashboard_lookup.py index c244e49..73163b0 100644 --- a/bioscancast/tests/test_dashboard_lookup.py +++ b/bioscancast/tests/test_dashboard_lookup.py @@ -46,6 +46,23 @@ def test_case_insensitive(self): results = lookup_dashboards(q) assert len(results) > 0 + def test_multiword_pathogen_routes_via_substring(self): + # CSV-natural "Marburg Virus Disease" -> pathogen "marburg virus disease" + # must still route to the "marburg" dashboard key. + canonical = lookup_dashboards(_make_question(pathogen="marburg")) + multiword = lookup_dashboards(_make_question(pathogen="marburg virus disease")) + assert len(multiword) > 0 + assert [r.url for r in multiword] == [r.url for r in canonical] + + def test_alias_routes_to_canonical(self): + # "monkeypox" -> "mpox"; "bird flu" -> "h5n1". + assert len(lookup_dashboards(_make_question(pathogen="monkeypox"))) > 0 + assert ( + [r.url for r in lookup_dashboards(_make_question(pathogen="monkeypox"))] + == [r.url for r in lookup_dashboards(_make_question(pathogen="mpox"))] + ) + assert len(lookup_dashboards(_make_question(pathogen="bird flu"))) > 0 + def test_results_have_required_fields(self): q = _make_question(pathogen="ebola") results = lookup_dashboards(q) From 8136172c8cc4d8776030f5d5b3dae7676da59fdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:23 +0200 Subject: [PATCH 20/21] Add national/international news outlets to Tier 3 Reputable outbreak reporting from outlets like CNN, NBC, CBS, ABC, NPR, USA Today, LA Times, Politico, Axios, Forbes, Bloomberg, FT, WSJ, Economist, Time, The Atlantic, Ars Technica and Business Insider was resolving to the 'unknown' tier (domain_score 0.2), sinking it below the filter's credibility floor. Promote them to Tier 3 (trusted_media, 0.6); second-level-domain matching covers subdomains. Relates to #13. Co-Authored-By: Claude Opus 4.7 --- bioscancast/datasets/source_tiers.py | 30 +++++++++++++++++++++++ bioscancast/tests/test_tier_resolution.py | 12 +++++++++ 2 files changed, 42 insertions(+) diff --git a/bioscancast/datasets/source_tiers.py b/bioscancast/datasets/source_tiers.py index 93a0013..8abadcc 100644 --- a/bioscancast/datasets/source_tiers.py +++ b/bioscancast/datasets/source_tiers.py @@ -59,6 +59,36 @@ "wikipedia.org", "sciencedirect.com", "pubmed.ncbi.nlm.nih.gov", + # National/international news with established newsrooms. Added after the + # #13 tier-coverage audit (data/investigations/findings-issues-3-4-13.md): + # live pools showed reputable outbreak reporting from these outlets + # resolving to "unknown" (domain_score 0.2), which sank them below the + # filter's credibility floor. Second-level-domain matching in + # resolve_tier() covers subdomains (edition.cnn.com, ca.news.yahoo.com, + # africa.businessinsider.com, etc.). + "cnn.com", + "nbcnews.com", + "cbsnews.com", + "abcnews.go.com", + "abcnews.com", + "npr.org", + "pbs.org", + "usatoday.com", + "latimes.com", + "politico.com", + "politico.eu", + "axios.com", + "thehill.com", + "forbes.com", + "bloomberg.com", + "ft.com", + "wsj.com", + "economist.com", + "time.com", + "theatlantic.com", + "newyorker.com", + "arstechnica.com", + "businessinsider.com", } TIER_4_DOMAINS: set[str] = { diff --git a/bioscancast/tests/test_tier_resolution.py b/bioscancast/tests/test_tier_resolution.py index d8aa72f..d291303 100644 --- a/bioscancast/tests/test_tier_resolution.py +++ b/bioscancast/tests/test_tier_resolution.py @@ -54,6 +54,18 @@ def test_unknown_domain(self): assert score == 0.2 assert label == "unknown" + def test_national_news_is_trusted_media(self): + for domain in ("cnn.com", "nbcnews.com", "forbes.com", "latimes.com", "npr.org"): + tier, score, label = resolve_tier(domain) + assert tier == 3, domain + assert score == 0.6, domain + assert label == "trusted_media", domain + + def test_national_news_subdomain_match(self): + # edition.cnn.com / africa.businessinsider.com resolve via SLD. + assert resolve_tier("edition.cnn.com")[2] == "trusted_media" + assert resolve_tier("africa.businessinsider.com")[2] == "trusted_media" + def test_subdomain_match(self): """wwwnc.cdc.gov should match cdc.gov via second-level domain.""" tier, score, label = resolve_tier("wwwnc.cdc.gov") From 953a95a18d21bd1350def3178740407bf5abacbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 13:03:34 +0200 Subject: [PATCH 21/21] Add opt-in soft fallback for the no-LLM filter path When llm_client is None the ambiguous rerank band was always rejected (fail-closed), which is overly aggressive for dev/offline/no-API-key runs. Add a default-off FILTER_CONFIG flag 'no_llm_soft_fallback' (+ no_llm_fallback_relevance_threshold) that instead keeps a borderline candidate iff it is an official domain OR its keyword-overlap relevance clears the threshold, approximating the LLM-rescue path. Production (always has an LLM client) is unchanged. Addresses #13. Co-Authored-By: Claude Opus 4.7 --- bioscancast/filtering/config.py | 12 +++++++ bioscancast/filtering/pipeline.py | 19 ++++++++-- bioscancast/tests/test_pipeline.py | 58 +++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/bioscancast/filtering/config.py b/bioscancast/filtering/config.py index 4bf99c2..187dee4 100644 --- a/bioscancast/filtering/config.py +++ b/bioscancast/filtering/config.py @@ -56,6 +56,18 @@ "auto_reject_after_rerank": 0.30, "max_llm_filter_candidates": 10, + # When no LLM client is configured, the ambiguous "llm_needed" band + # (reranked priority between auto_reject and auto_keep) is normally + # rejected outright (fail-closed). With this flag enabled — for dev / + # offline / no-API-key runs — a borderline candidate is instead KEPT if it + # is an official domain OR its keyword-overlap relevance clears + # ``no_llm_fallback_relevance_threshold``. This approximates the LLM-rescue + # path without an API call, recovering the on-topic / authoritative tail + # without admitting the generic-news mass. Default OFF so production (which + # always has an LLM client) is unchanged. See issue #13. + "no_llm_soft_fallback": False, + "no_llm_fallback_relevance_threshold": 0.5, + "max_docs_per_domain": 2, "max_docs_per_type": 5, diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py index e8c2e5b..80dc0d7 100644 --- a/bioscancast/filtering/pipeline.py +++ b/bioscancast/filtering/pipeline.py @@ -37,11 +37,24 @@ def run( llm_decisions: list[FilterDecision] = [] if llm_needed: if self.llm_client is None: - # Fail closed: reject ambiguous cases if no LLM client is configured. + # No LLM client. Default is fail-closed (reject the ambiguous + # band). When the soft-fallback flag is enabled, keep candidates + # that are official-domain or sufficiently relevant — see + # FILTER_CONFIG["no_llm_soft_fallback"] and issue #13. + soft = FILTER_CONFIG.get("no_llm_soft_fallback", False) + rel_threshold = FILTER_CONFIG.get( + "no_llm_fallback_relevance_threshold", 0.5 + ) for d in llm_needed: - d.keep = False d.stage = "llm_skipped" - d.reason_codes.append("no_llm_client_configured") + result = result_map.get(d.result_id) + is_official = bool(result and result.is_official_domain) + if soft and (is_official or d.relevance_score >= rel_threshold): + d.keep = True + d.reason_codes.append("no_llm_soft_fallback_kept") + else: + d.keep = False + d.reason_codes.append("no_llm_client_configured") llm_decisions = llm_needed else: llm_decisions = llm_filter_candidates( diff --git a/bioscancast/tests/test_pipeline.py b/bioscancast/tests/test_pipeline.py index f65bceb..f914225 100644 --- a/bioscancast/tests/test_pipeline.py +++ b/bioscancast/tests/test_pipeline.py @@ -1,5 +1,6 @@ from datetime import datetime +from bioscancast.filtering.config import FILTER_CONFIG from bioscancast.filtering.models import ForecastQuestion, SearchResult from bioscancast.filtering.pipeline import FilteringPipeline @@ -36,4 +37,59 @@ def test_pipeline_keeps_official_result(): docs = pipeline.run(question, [result]) assert len(docs) == 1 - assert docs[0].domain == "who.int" \ No newline at end of file + assert docs[0].domain == "who.int" + + +def _borderline_question(): + return ForecastQuestion( + id="q1", + text="How many confirmed Ebola cases in the DRC outbreak?", + created_at=datetime(2026, 5, 1), + pathogen="ebola", + region="DRC", + ) + + +def _borderline_result(): + # trusted_media (domain_score 0.6, non-official) with partial term overlap → + # lands in the heuristic borderline band, then the no-LLM "llm_needed" band. + return SearchResult( + id="r-border", + question_id="q1", + query_id="sq1", + engine="google", + url="https://www.cnn.com/ebola-drc", + canonical_url="https://www.cnn.com/ebola-drc", + domain="cnn.com", + title="Ebola cases climb in the DRC outbreak", + snippet="Confirmed Ebola cases reported in the Democratic Republic of the Congo outbreak.", + rank=2, + retrieved_at=datetime(2026, 5, 1), + source_tier="trusted_media", + is_official_domain=False, + domain_score=0.6, + freshness_score=1.0, + search_stage_score=0.6, + ) + + +def test_no_llm_soft_fallback_flag_changes_borderline_outcome(): + question = _borderline_question() + result = _borderline_result() + + saved = dict(FILTER_CONFIG) + try: + # Flag OFF (default): fail closed → borderline candidate dropped. + FILTER_CONFIG["no_llm_soft_fallback"] = False + docs_off = FilteringPipeline(llm_client=None).run(question, [result]) + + # Flag ON: relevant borderline candidate kept without an LLM call. + FILTER_CONFIG["no_llm_soft_fallback"] = True + FILTER_CONFIG["no_llm_fallback_relevance_threshold"] = 0.0 + docs_on = FilteringPipeline(llm_client=None).run(question, [result]) + finally: + FILTER_CONFIG.clear() + FILTER_CONFIG.update(saved) + + assert {d.result_id for d in docs_off} == set() + assert {d.result_id for d in docs_on} == {"r-border"} \ No newline at end of file