Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions bioscancast/insight/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
"max_chunks_per_document": 12,
"extraction_max_output_tokens": 4096,
"chunk_workers": 6,
"low_survival_doc_threshold": 5,
"low_survival_top_k": 20,
}


Expand Down Expand Up @@ -43,6 +45,18 @@ class InsightConfig:
Set to 1 for sequential execution (useful for debugging or rate-
limit-sensitive setups)."""

low_survival_doc_threshold: int = 5
"""When the filter passes this many or fewer usable documents to
insight, raise both retrieval depth and the per-document chunk cap to
at least ``low_survival_top_k``. q7 reached insight with only 2
surviving documents; in that regime per-doc retrieval depth becomes
the bottleneck on coverage."""

low_survival_top_k: int = 20
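"""Retrieval / per-doc cap used when usable documents are at or below
``low_survival_doc_threshold``. The pipeline takes the larger of this
value and the configured ``retrieval_top_k`` (for retrieval) or
``max_chunks_per_document`` (for the per-doc cap), so setting it at or
below both of those disables the adaptive lift. ``None`` is not
supported: the value is passed to ``max()`` alongside those ints."""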

@classmethod
def from_dict(cls, d: dict) -> InsightConfig:
"""Create an InsightConfig from a dict, ignoring unknown keys."""
Expand Down
18 changes: 18 additions & 0 deletions bioscancast/insight/extraction/chunk_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]:
if unwrap_quote in unwrap_chunk:
return unwrap_quote

# Layer 4: case-insensitive substring. Catches the model lowercasing
# the leading letter of a sentence it quotes from mid-paragraph - an
# otherwise-verbatim drift that is very common (q12 live runs:
# "there are now 750 suspected cases..." vs the source's "There are
# now 750..."). Returns the chunk's own casing so the stored quote
# reflects the source. Crucially this does NOT recover content-
# insertion hallucinations: a fabricated continuation still fails the
# substring test regardless of case (verified against the q12
# "...have been reported in Ituri, North Kivu" fabrication, whose real
# source text continues "...and 906 suspected cases").
# NOTE(review): the slice below reuses an offset found in the lowercased
# chunk, which assumes str.lower() preserves length. That does not hold
# for a few code points (e.g. U+0130 lowercases to two code points), so
# the returned span could be misaligned on such text - confirm or guard.
ci_chunk = norm_chunk.lower()
for candidate in (norm_quote, stripped):
if not candidate:
continue
idx = ci_chunk.find(candidate.lower())
if idx >= 0:
return norm_chunk[idx: idx + len(candidate)]

return None


Expand Down
25 changes: 17 additions & 8 deletions bioscancast/insight/extraction/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@
by the chunk text. Do NOT infer, speculate, or use outside knowledge.
2. For each fact, provide a verbatim quote from the chunk (max 200 \
characters) that supports the claim. The quote must be an exact \
substring of the chunk text.
substring of the chunk text. The quote MUST be the sentence (or \
sentence fragment) that carries the figure itself — it must contain \
the metric_value either as digits (e.g. "82"), as a number-word \
(e.g. "eighty-two", "a dozen"), or as a clear relative reference \
(e.g. "a quarter of the population"). A contextual or supporting \
sentence that mentions the topic but not the figure is NOT acceptable.
3. If the chunk contains no relevant facts, return an empty facts list. \
This is expected and common — most chunks are irrelevant.
4. Do NOT answer the forecast question. Your job is fact extraction, \
Expand All @@ -40,12 +45,13 @@
6. For metric_name, use one of these canonical snake_case values when \
applicable (this lets downstream dedup merge facts about the same \
metric across sources):
- confirmed_cases (suspected, probable, possible all get \
their own variants below)
- suspected_cases
- probable_cases
- confirmed_or_probable_cases
- deaths
- confirmed_cases (the "confirmed" tier — lab-confirmed)
- suspected_cases (the "not-yet-confirmed" tier — covers \
"suspected", "probable", and "possible" reporting categories)
- confirmed_or_probable_cases (WHO/CDC's combined reporting bucket)
- deaths (lab-confirmed deaths)
- suspected_deaths (the "not-yet-confirmed" tier for deaths — \
covers "suspected", "probable", "under investigation" reporting)
- hospitalizations
- recoveries
- vaccinations_administered
Expand All @@ -58,7 +64,10 @@
If none of these fit, invent a short snake_case label. Do NOT put \
qualifiers (sex, age, sub-region, time-period like "weekly") in \
metric_name — capture those in `summary` or `location` instead. \
"cases", "reported cases", "total cases" all map to confirmed_cases.
"cases", "reported cases", "total cases" all map to confirmed_cases. \
"suspected cases", "probable cases", "possible cases" all map to \
suspected_cases. "deaths" alone maps to deaths; "suspected deaths", \
"probable deaths", "deaths under investigation" map to suspected_deaths.
7. Be aware of cognitive biases that affect information processing:
- Anchoring: do not over-weight the first number you encounter.
- Availability: rare dramatic events are not necessarily more likely.
Expand Down
28 changes: 26 additions & 2 deletions bioscancast/insight/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,30 @@ def run(
result = InsightRunResult()
embedding_cache: dict[str, list[float]] = {}

# Adaptive top-k: when the filter passes through only a handful
# of usable documents, lift retrieval depth so the per-doc chunk
# budget isn't the bottleneck on coverage. See InsightConfig
# docstrings for the rationale.
usable_doc_count = sum(
1 for d in documents if d.status != "failed" and d.chunks
)
if usable_doc_count <= config.low_survival_doc_threshold:
effective_top_k = max(config.retrieval_top_k, config.low_survival_top_k)
effective_max_chunks = max(
config.max_chunks_per_document, config.low_survival_top_k
)
if effective_top_k != config.retrieval_top_k:
result.notes.append(
f"Low-survival adaptive top_k engaged: "
f"{usable_doc_count} usable docs (≤ threshold "
f"{config.low_survival_doc_threshold}); "
f"retrieval_top_k={effective_top_k} (default "
f"{config.retrieval_top_k})."
)
else:
effective_top_k = config.retrieval_top_k
effective_max_chunks = config.max_chunks_per_document

for doc in documents:
# --- Skip check ---
if doc.status == "failed" or not doc.chunks:
Expand All @@ -126,15 +150,15 @@ def run(
question,
doc,
self._llm,
top_k=config.retrieval_top_k,
top_k=effective_top_k,
bm25_weight=config.bm25_weight,
embedding_weight=config.embedding_weight,
embedding_model=config.embedding_model,
embedding_cache=embedding_cache,
)

# Cap chunks per document
scored_chunks = scored_chunks[: config.max_chunks_per_document]
scored_chunks = scored_chunks[:effective_max_chunks]

# --- Per-chunk extraction (parallel within a doc) ---
# Live tests on real biosecurity documents show the per-doc
Expand Down
35 changes: 34 additions & 1 deletion bioscancast/tests/test_insight_chunk_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,13 +367,43 @@ def test_response_returned_for_budget_tracking():
]


_LAYER4_CASE_INSENSITIVE_CASES = [
(
# Real q12 finding: model lowercased the leading "T" of a sentence
# it quoted from mid-paragraph; otherwise verbatim.
"leading letter lowercased by model",
"There are now 750 suspected cases and 177 suspected deaths, though more are expected.",
"there are now 750 suspected cases and 177 suspected deaths",
True,
),
(
# Real q12 finding: same drift on a longer attribution clause.
"leading 'The' lowercased mid-paragraph quote",
"The Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths.",
"the Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths",
True,
),
]


_HALLUCINATION_CASES = [
(
"fabricated word inserted into list",
"Ghana and Liberia have reported human mpox due to clade IIa MPXV.",
"Ghana, Atlantis, and Liberia have reported human mpox due to clade IIa MPXV.",
False,
),
(
# Real q12 finding: model bolted a real prefix ("a total of 105
# confirmed cases (including 10 deaths)") onto a fabricated
# continuation. The source actually continues "...and 906
# suspected cases". Must stay rejected even with the new
# case-insensitive layer 4.
"real prefix bolted onto fabricated continuation (q12)",
"According to the Ministry of Health of DRC on 25 May, a total of 105 confirmed cases (including 10 deaths) and 906 suspected cases.",
"a total of 105 confirmed cases (including 10 deaths) have been reported in Ituri, North Kivu, and South Kivu",
False,
),
(
"wholesale fabrication",
"Some real chunk content about measles cases in Utah.",
Expand Down Expand Up @@ -410,7 +440,10 @@ def test_response_returned_for_budget_tracking():

@pytest.mark.parametrize(
"label,chunk_text,quote,should_match",
_LAYER1_NFKC_CASES + _LAYER2_TERMINAL_PUNCTUATION_CASES + _LAYER3_WRAPPING_PUNCTUATION_CASES,
_LAYER1_NFKC_CASES
+ _LAYER2_TERMINAL_PUNCTUATION_CASES
+ _LAYER3_WRAPPING_PUNCTUATION_CASES
+ _LAYER4_CASE_INSENSITIVE_CASES,
)
def test_quote_matches_accepts_real_quotes_with_normalisation_drift(
label, chunk_text, quote, should_match
Expand Down
35 changes: 26 additions & 9 deletions bioscancast/tests/test_insight_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def test_pipeline_single_document():
RISK_ASSESSMENT_RESPONSE, # chunk p4 (no facts)
])

config = InsightConfig(retrieval_top_k=5, max_chunks_per_document=5)
config = InsightConfig(
retrieval_top_k=5, max_chunks_per_document=5, low_survival_top_k=5,
)
pipeline = InsightPipeline(llm_client=client, config=config)

result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN])
Expand Down Expand Up @@ -91,7 +93,9 @@ def test_pipeline_skips_failed_documents():
EMPTY_RESPONSE, # For the one chunk that gets extracted
])

config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1)
config = InsightConfig(
retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1,
)
pipeline = InsightPipeline(llm_client=client, config=config)

# Include a failed document alongside a successful one
Expand All @@ -114,7 +118,9 @@ def test_pipeline_budget_tracking():
SUDAN_TABLE_RESPONSE,
])

config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2)
config = InsightConfig(
retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2,
)
pipeline = InsightPipeline(llm_client=client, config=config)

result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN])
Expand All @@ -137,6 +143,7 @@ def test_pipeline_stops_on_budget_exceeded():
config = InsightConfig(
retrieval_top_k=2,
max_chunks_per_document=2,
low_survival_top_k=2,
max_input_tokens_per_run=1, # Absurdly low -> triggers immediately
)
pipeline = InsightPipeline(llm_client=client, config=config)
Expand Down Expand Up @@ -170,7 +177,9 @@ def test_pipeline_deduplication():
DUPLICATE_SUDAN_CASE_COUNT, # doc 2 -> 1 fact (duplicate case)
])

config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1)
config = InsightConfig(
retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1,
)
pipeline = InsightPipeline(llm_client=client, config=config)

result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN, doc2])
Expand Down Expand Up @@ -580,6 +589,7 @@ def test_pipeline_parallel_chunk_extraction_produces_all_records():
config = InsightConfig(
retrieval_top_k=4,
max_chunks_per_document=4,
low_survival_top_k=4,
chunk_workers=4,
)
pipeline = InsightPipeline(llm_client=fake, config=config)
Expand All @@ -603,10 +613,12 @@ def test_pipeline_sequential_and_parallel_produce_same_record_count():
of records when the fake LLM is content-keyed (so result depends on
chunk content, not worker order)."""
config_seq = InsightConfig(
retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=1,
retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4,
chunk_workers=1,
)
config_par = InsightConfig(
retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4,
retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4,
chunk_workers=4,
)

seq_pipeline = InsightPipeline(
Expand Down Expand Up @@ -653,7 +665,8 @@ def embed(self, texts, *, model):

fake = _IntermittentFake()
config = InsightConfig(
retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4,
retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4,
chunk_workers=4,
)
pipeline = InsightPipeline(llm_client=fake, config=config)
# Must not raise — failed chunk is logged and skipped
Expand Down Expand Up @@ -684,7 +697,9 @@ def test_pipeline_multi_document():
H5N1_TABLE_RESPONSE,
])

config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2)
config = InsightConfig(
retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2,
)
pipeline = InsightPipeline(llm_client=client, config=config)

result = pipeline.run(QUESTION_H5N1, [DOC_WHO_SUDAN, DOC_CDC_H5N1])
Expand All @@ -709,7 +724,9 @@ def test_pipeline_output_records_valid():
SUDAN_TABLE_RESPONSE,
])

config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2)
config = InsightConfig(
retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2,
)
pipeline = InsightPipeline(llm_client=client, config=config)

result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN])
Expand Down