From bb852b3af12fd3b14294a5f29d28f178ce1a9b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:33:49 +0200 Subject: [PATCH 1/4] Add suspected_deaths to controlled metric_name vocabulary q12 Record 3 reported metric_name=deaths, metric_value=160 from the source quote "160 suspected deaths out of 670 suspected cases". The prompt's canonical vocabulary already had `suspected_cases` (for the 670) but no `suspected_deaths` slot - so the model collapsed the "suspected" qualifier and emitted plain `deaths`. The result was an arithmetically scandalous record (160 deaths against 61 confirmed cases) that wouldn't survive any reasonable post-hoc sanity check. Changes: - Add `suspected_deaths` alongside the existing `suspected_cases` entry. Two-tier system per category (confirmed_* and suspected_*), matching the agreed shape of the vocab. - Drop the now-redundant standalone `probable_cases` line; the `suspected_cases` description explicitly covers "suspected", "probable", and "possible" as the same tier. WHO's combined `confirmed_or_probable_cases` bucket is kept separately because it is a distinct reporting category. - Add a deaths-family mapping rule paralleling the existing cases-family rule ("suspected deaths", "probable deaths", "deaths under investigation" all map to suspected_deaths). Mirror what the existing `confirmed_cases` line does for the cases family. - Clean up the stale "possible all get their own variants below" parenthetical which referenced a `possible_cases` slot that has never existed. After this, q12's "160 suspected deaths" should extract as suspected_deaths=160 rather than deaths=160 - same value, correct category, no longer competing with confirmed_cases for downstream forecasting weight. Implements item 3 from the Tier 1 roadmap. 
--- bioscancast/insight/extraction/prompts.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index 0f74b4c..ab603b6 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -40,12 +40,13 @@ 6. For metric_name, use one of these canonical snake_case values when \ applicable (this lets downstream dedup merge facts about the same \ metric across sources): - - confirmed_cases (suspected, probable, possible all get \ -their own variants below) - - suspected_cases - - probable_cases - - confirmed_or_probable_cases - - deaths + - confirmed_cases (the "confirmed" tier — lab-confirmed) + - suspected_cases (the "not-yet-confirmed" tier — covers \ +"suspected", "probable", and "possible" reporting categories) + - confirmed_or_probable_cases (WHO/CDC's combined reporting bucket) + - deaths (lab-confirmed deaths) + - suspected_deaths (the "not-yet-confirmed" tier for deaths — \ +covers "suspected", "probable", "under investigation" reporting) - hospitalizations - recoveries - vaccinations_administered @@ -58,7 +59,10 @@ If none of these fit, invent a short snake_case label. Do NOT put \ qualifiers (sex, age, sub-region, time-period like "weekly") in \ metric_name — capture those in `summary` or `location` instead. \ -"cases", "reported cases", "total cases" all map to confirmed_cases. +"cases", "reported cases", "total cases" all map to confirmed_cases. \ +"suspected cases", "probable cases", "possible cases" all map to \ +suspected_cases. "deaths" alone maps to deaths; "suspected deaths", \ +"probable deaths", "deaths under investigation" map to suspected_deaths. 7. Be aware of cognitive biases that affect information processing: - Anchoring: do not over-weight the first number you encounter. - Availability: rare dramatic events are not necessarily more likely. 
From f02beedac21a77081f8b6fa582359a93470c801d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:34:33 +0200 Subject: [PATCH 2/4] Require the quote field to contain the figure (extraction prompt) q12 Record 4 reported metric_value=82 with the quote "the outbreak now poses a 'very high' risk for Congo - up from a previous categorization of 'high'" - no digit, no number-word, no relative reference. The hallucination guard's verbatim-substring check passed because the quote string did appear in the source chunk, but nothing in the guard required the quote sentence to be the one actually carrying the figure. The metric_value of 82 came from elsewhere in the chunk; the quote was a "supporting context" sentence. A deterministic post-hoc check (str(metric_value) in quote) would over-reject: word numbers ("a dozen"), relative quantities ("a quarter of the population"), and number-word forms ("ninety-nine thousand") would all be false-positive rejections. So the fix lives at the prompt level instead: tell the model the quote MUST be the sentence that carries the figure - digits, number-word, or a clear relative reference. A purely contextual sentence is not acceptable. The verbatim-substring guard remains the safety net. This change tightens the model's understanding of what `quote` is supposed to do without committing to a brittle programmatic check that would lose real signal on legitimate verbatim quotes that state the figure in words rather than digits. Implements item 4 from the Tier 2 roadmap. --- bioscancast/insight/extraction/prompts.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index ab603b6..a750014 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -27,7 +27,12 @@ by the chunk text. Do NOT infer, speculate, or use outside knowledge. 2. 
For each fact, provide a verbatim quote from the chunk (max 200 \ characters) that supports the claim. The quote must be an exact \ -substring of the chunk text. +substring of the chunk text. The quote MUST be the sentence (or \ +sentence fragment) that carries the figure itself — it must contain \ +the metric_value either as digits (e.g. "82"), as a number-word \ +(e.g. "eighty-two", "a dozen"), or as a clear relative reference \ +(e.g. "a quarter of the population"). A contextual or supporting \ +sentence that mentions the topic but not the figure is NOT acceptable. 3. If the chunk contains no relevant facts, return an empty facts list. \ This is expected and common — most chunks are irrelevant. 4. Do NOT answer the forecast question. Your job is fact extraction, \ From 278b65956ad08573aeb8024e375234c2731d643b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Wed, 27 May 2026 23:39:28 +0200 Subject: [PATCH 3/4] Adaptive retrieval_top_k when filter survivors are few When the filtering stage passes only a handful of documents through to insight, per-document retrieval depth becomes the bottleneck on coverage. q7's live run reached insight with 2 usable documents and hit retrieval_top_k=12 on each - meaning at most ~24 chunk extractions for the whole question. Bumping per-doc retrieval depth costs little and gives the model more chances to find the relevant figure in each surviving document. Adds two InsightConfig fields: - low_survival_doc_threshold (default 5) - low_survival_top_k (default 20) When the count of usable documents (status != "failed" and non-empty chunks) is at or below the threshold, both retrieval_top_k and max_chunks_per_document effectively rise to low_survival_top_k for the run, and a note is appended to InsightRunResult.notes flagging that the adaptive path engaged. Runs with more usable documents than the threshold (e.g. 12 docs -> threshold of 5 not hit -> normal top_k=12) are unchanged. 
Tests that pin retrieval_top_k to small values to control fake-LLM call counts now also pin low_survival_top_k to the same value, opting out of the adaptive lift explicitly. 447 tests still passing. Implements item 6 from the Tier 2 roadmap. Completes the planned bundle of items 1+2+3+4+5+6. --- bioscancast/insight/config.py | 14 +++++++++ bioscancast/insight/pipeline.py | 28 +++++++++++++++-- bioscancast/tests/test_insight_pipeline.py | 35 ++++++++++++++++------ 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py index fad6418..6eaf37b 100644 --- a/bioscancast/insight/config.py +++ b/bioscancast/insight/config.py @@ -16,6 +16,8 @@ "max_chunks_per_document": 12, "extraction_max_output_tokens": 4096, "chunk_workers": 6, + "low_survival_doc_threshold": 5, + "low_survival_top_k": 20, } @@ -43,6 +45,18 @@ class InsightConfig: Set to 1 for sequential execution (useful for debugging or rate- limit-sensitive setups).""" + low_survival_doc_threshold: int = 5 + """When the filter passes this many or fewer usable documents to + insight, switch to ``low_survival_top_k`` for both retrieval and the + per-document chunk cap. q7 reached insight with only 2 surviving + documents; in that regime per-doc retrieval depth becomes the + bottleneck on coverage.""" + + low_survival_top_k: int = 20 + """Retrieval / per-doc cap used when usable documents are at or below + ``low_survival_doc_threshold``. 
Must be an int; set it at or below + ``retrieval_top_k`` and ``max_chunks_per_document`` to disable the lift.""" + @classmethod def from_dict(cls, d: dict) -> InsightConfig: """Create an InsightConfig from a dict, ignoring unknown keys.""" diff --git a/bioscancast/insight/pipeline.py b/bioscancast/insight/pipeline.py index ea294fd..7d5c5dd 100644 --- a/bioscancast/insight/pipeline.py +++ b/bioscancast/insight/pipeline.py @@ -103,6 +103,30 @@ def run( result = InsightRunResult() embedding_cache: dict[str, list[float]] = {} + # Adaptive top-k: when the filter passes through only a handful + # of usable documents, lift retrieval depth so the per-doc chunk + # budget isn't the bottleneck on coverage. See InsightConfig + # docstrings for the rationale. + usable_doc_count = sum( + 1 for d in documents if d.status != "failed" and d.chunks + ) + if usable_doc_count <= config.low_survival_doc_threshold: + effective_top_k = max(config.retrieval_top_k, config.low_survival_top_k) + effective_max_chunks = max( + config.max_chunks_per_document, config.low_survival_top_k + ) + if effective_top_k != config.retrieval_top_k: + result.notes.append( + f"Low-survival adaptive top_k engaged: " + f"{usable_doc_count} usable docs (≤ threshold " + f"{config.low_survival_doc_threshold}); " + f"retrieval_top_k={effective_top_k} (default " + f"{config.retrieval_top_k})." 
+ ) + else: + effective_top_k = config.retrieval_top_k + effective_max_chunks = config.max_chunks_per_document + for doc in documents: # --- Skip check --- if doc.status == "failed" or not doc.chunks: @@ -126,7 +150,7 @@ def run( question, doc, self._llm, - top_k=config.retrieval_top_k, + top_k=effective_top_k, bm25_weight=config.bm25_weight, embedding_weight=config.embedding_weight, embedding_model=config.embedding_model, @@ -134,7 +158,7 @@ def run( ) # Cap chunks per document - scored_chunks = scored_chunks[: config.max_chunks_per_document] + scored_chunks = scored_chunks[:effective_max_chunks] # --- Per-chunk extraction (parallel within a doc) --- # Live tests on real biosecurity documents show the per-doc diff --git a/bioscancast/tests/test_insight_pipeline.py b/bioscancast/tests/test_insight_pipeline.py index 1d6b9ba..679924a 100644 --- a/bioscancast/tests/test_insight_pipeline.py +++ b/bioscancast/tests/test_insight_pipeline.py @@ -59,7 +59,9 @@ def test_pipeline_single_document(): RISK_ASSESSMENT_RESPONSE, # chunk p4 (no facts) ]) - config = InsightConfig(retrieval_top_k=5, max_chunks_per_document=5) + config = InsightConfig( + retrieval_top_k=5, max_chunks_per_document=5, low_survival_top_k=5, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -91,7 +93,9 @@ def test_pipeline_skips_failed_documents(): EMPTY_RESPONSE, # For the one chunk that gets extracted ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) # Include a failed document alongside a successful one @@ -114,7 +118,9 @@ def test_pipeline_budget_tracking(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, 
+ ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -137,6 +143,7 @@ def test_pipeline_stops_on_budget_exceeded(): config = InsightConfig( retrieval_top_k=2, max_chunks_per_document=2, + low_survival_top_k=2, max_input_tokens_per_run=1, # Absurdly low -> triggers immediately ) pipeline = InsightPipeline(llm_client=client, config=config) @@ -170,7 +177,9 @@ def test_pipeline_deduplication(): DUPLICATE_SUDAN_CASE_COUNT, # doc 2 -> 1 fact (duplicate case) ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN, doc2]) @@ -580,6 +589,7 @@ def test_pipeline_parallel_chunk_extraction_produces_all_records(): config = InsightConfig( retrieval_top_k=4, max_chunks_per_document=4, + low_survival_top_k=4, chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, config=config) @@ -603,10 +613,12 @@ def test_pipeline_sequential_and_parallel_produce_same_record_count(): of records when the fake LLM is content-keyed (so result depends on chunk content, not worker order).""" config_seq = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=1, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=1, ) config_par = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) seq_pipeline = InsightPipeline( @@ -653,7 +665,8 @@ def embed(self, texts, *, model): fake = _IntermittentFake() config = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, 
config=config) # Must not raise — failed chunk is logged and skipped @@ -684,7 +697,9 @@ def test_pipeline_multi_document(): H5N1_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_H5N1, [DOC_WHO_SUDAN, DOC_CDC_H5N1]) @@ -709,7 +724,9 @@ def test_pipeline_output_records_valid(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) From 248fc229044aa6ca67a6f8a22fc4c4658e684d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Mod=C3=A9e?= Date: Thu, 28 May 2026 08:14:05 +0200 Subject: [PATCH 4/4] Add case-insensitive layer 4 to the hallucination guard Item 8 investigation: of the three quotes the guard dropped across the q12 live runs, two were real facts lost purely because the model lowercased the leading letter of a sentence it quoted from mid-paragraph: source: "There are now 750 suspected cases and 177 suspected deaths" model: "there are now 750 suspected cases and 177 suspected deaths" source: "The Congolese Ministry of Communication, in a post to X ... said that there were 904 suspected cases and 119 ..." model: "the Congolese Ministry of Communication, in a post to X ..." The third rejection was a genuine content-insertion hallucination - the model bolted the real prefix "a total of 105 confirmed cases (including 10 deaths)" onto a fabricated continuation "...have been reported in Ituri, North Kivu, and South Kivu" (the source actually continues "...and 906 suspected cases"). Fix: a fourth, case-insensitive substring layer. 
It returns the chunk's own casing so the stored quote still reflects the source verbatim. The key safety property - verified against the real q12 fabrication and captured in a new regression test - is that case-folding does NOT recover content insertions: a fabricated continuation fails the substring test regardless of case. Tests: new _LAYER4_CASE_INSENSITIVE_CASES (the two recovered q12 quotes) plus a hallucination case mirroring the q12 fabrication that must stay rejected. 450 passed (was 447; +3 guard cases). Implements item 8 from the roadmap. --- .../insight/extraction/chunk_extractor.py | 18 ++++++++++ .../tests/test_insight_chunk_extractor.py | 35 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py index dd24d71..d8a773d 100644 --- a/bioscancast/insight/extraction/chunk_extractor.py +++ b/bioscancast/insight/extraction/chunk_extractor.py @@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]: if unwrap_quote in unwrap_chunk: return unwrap_quote + # Layer 4: case-insensitive substring. Catches the model lowercasing + # the leading letter of a sentence it quotes from mid-paragraph - + # otherwise verbatim drift that's very common (q12 live runs: + # "there are now 750 suspected cases..." vs the source's "There are + # now 750..."). Returns the chunk's own casing so the stored quote + # reflects the source. Crucially this does NOT recover content- + # insertion hallucinations: a fabricated continuation still fails the + # substring test regardless of case (verified against the q12 + # "...have been reported in Ituri, North Kivu" fabrication, whose real + # source text continues "...and 906 suspected cases"). 
+ ci_chunk = norm_chunk.lower() + for candidate in (norm_quote, stripped): + if not candidate: + continue + idx = ci_chunk.find(candidate.lower()) + if idx >= 0: + return norm_chunk[idx: idx + len(candidate)] + return None diff --git a/bioscancast/tests/test_insight_chunk_extractor.py b/bioscancast/tests/test_insight_chunk_extractor.py index 2ba0421..4a112b9 100644 --- a/bioscancast/tests/test_insight_chunk_extractor.py +++ b/bioscancast/tests/test_insight_chunk_extractor.py @@ -367,6 +367,25 @@ def test_response_returned_for_budget_tracking(): ] +_LAYER4_CASE_INSENSITIVE_CASES = [ + ( + # Real q12 finding: model lowercased the leading "T" of a sentence + # it quoted from mid-paragraph; otherwise verbatim. + "leading letter lowercased by model", + "There are now 750 suspected cases and 177 suspected deaths, though more are expected.", + "there are now 750 suspected cases and 177 suspected deaths", + True, + ), + ( + # Real q12 finding: same drift on a longer attribution clause. + "leading 'The' lowercased mid-paragraph quote", + "The Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths.", + "the Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths", + True, + ), +] + + _HALLUCINATION_CASES = [ ( "fabricated word inserted into list", @@ -374,6 +393,17 @@ def test_response_returned_for_budget_tracking(): "Ghana, Atlantis, and Liberia have reported human mpox due to clade IIa MPXV.", False, ), + ( + # Real q12 finding: model bolted a real prefix ("a total of 105 + # confirmed cases (including 10 deaths)") onto a fabricated + # continuation. The source actually continues "...and 906 + # suspected cases". Must stay rejected even with the new + # case-insensitive layer 4. 
+ "real prefix bolted onto fabricated continuation (q12)", + "According to the Ministry of Health of DRC on 25 May, a total of 105 confirmed cases (including 10 deaths) and 906 suspected cases.", + "a total of 105 confirmed cases (including 10 deaths) have been reported in Ituri, North Kivu, and South Kivu", + False, + ), ( "wholesale fabrication", "Some real chunk content about measles cases in Utah.", @@ -410,7 +440,10 @@ def test_response_returned_for_budget_tracking(): @pytest.mark.parametrize( "label,chunk_text,quote,should_match", - _LAYER1_NFKC_CASES + _LAYER2_TERMINAL_PUNCTUATION_CASES + _LAYER3_WRAPPING_PUNCTUATION_CASES, + _LAYER1_NFKC_CASES + + _LAYER2_TERMINAL_PUNCTUATION_CASES + + _LAYER3_WRAPPING_PUNCTUATION_CASES + + _LAYER4_CASE_INSENSITIVE_CASES, ) def test_quote_matches_accepts_real_quotes_with_normalisation_drift( label, chunk_text, quote, should_match