diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py index fad6418..6eaf37b 100644 --- a/bioscancast/insight/config.py +++ b/bioscancast/insight/config.py @@ -16,6 +16,8 @@ "max_chunks_per_document": 12, "extraction_max_output_tokens": 4096, "chunk_workers": 6, + "low_survival_doc_threshold": 5, + "low_survival_top_k": 20, } @@ -43,6 +45,18 @@ class InsightConfig: Set to 1 for sequential execution (useful for debugging or rate- limit-sensitive setups).""" + low_survival_doc_threshold: int = 5 + """When the filter passes this many usable documents or fewer to + insight, switch to ``low_survival_top_k`` for both retrieval and the + per-document chunk cap. q7 reached insight with only 2 surviving + documents; in that regime per-doc retrieval depth becomes the + bottleneck on coverage.""" + + low_survival_top_k: int = 20 + """Retrieval / per-doc cap used when usable documents are at or below + ``low_survival_doc_threshold``. Set at or below ``retrieval_top_k`` and + ``max_chunks_per_document`` to disable the lift (``None`` is unsupported).""" + @classmethod def from_dict(cls, d: dict) -> InsightConfig: """Create an InsightConfig from a dict, ignoring unknown keys.""" diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py index dd24d71..d8a773d 100644 --- a/bioscancast/insight/extraction/chunk_extractor.py +++ b/bioscancast/insight/extraction/chunk_extractor.py @@ -221,6 +221,24 @@ def _quote_matches(quote: str, chunk_text: str) -> Optional[str]: if unwrap_quote in unwrap_chunk: return unwrap_quote + # Layer 4: case-insensitive substring. Catches the model lowercasing + # the leading letter of a sentence it quotes from mid-paragraph - + # otherwise verbatim drift that's very common (q12 live runs: + # "there are now 750 suspected cases..." vs the source's "There are + # now 750..."). Returns the chunk's own casing so the stored quote + # reflects the source. 
Crucially this does NOT recover content- + # insertion hallucinations: a fabricated continuation still fails the + # substring test regardless of case (verified against the q12 + # "...have been reported in Ituri, North Kivu" fabrication, whose real + # source text continues "...and 906 suspected cases"). + ci_chunk = norm_chunk.lower() + for candidate in (norm_quote, stripped): + if not candidate: + continue + idx = ci_chunk.find(candidate.lower()) + if idx >= 0: + return norm_chunk[idx: idx + len(candidate)] + return None diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index 0f74b4c..a750014 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -27,7 +27,12 @@ by the chunk text. Do NOT infer, speculate, or use outside knowledge. 2. For each fact, provide a verbatim quote from the chunk (max 200 \ characters) that supports the claim. The quote must be an exact \ -substring of the chunk text. +substring of the chunk text. The quote MUST be the sentence (or \ +sentence fragment) that carries the figure itself — it must contain \ +the metric_value either as digits (e.g. "82"), as a number-word \ +(e.g. "eighty-two", "a dozen"), or as a clear relative reference \ +(e.g. "a quarter of the population"). A contextual or supporting \ +sentence that mentions the topic but not the figure is NOT acceptable. 3. If the chunk contains no relevant facts, return an empty facts list. \ This is expected and common — most chunks are irrelevant. 4. Do NOT answer the forecast question. Your job is fact extraction, \ @@ -40,12 +45,13 @@ 6. 
For metric_name, use one of these canonical snake_case values when \ applicable (this lets downstream dedup merge facts about the same \ metric across sources): - - confirmed_cases (suspected, probable, possible all get \ -their own variants below) - - suspected_cases - - probable_cases - - confirmed_or_probable_cases - - deaths + - confirmed_cases (the "confirmed" tier — lab-confirmed) + - suspected_cases (the "not-yet-confirmed" tier — covers \ +"suspected", "probable", and "possible" reporting categories) + - confirmed_or_probable_cases (WHO/CDC's combined reporting bucket) + - deaths (lab-confirmed deaths) + - suspected_deaths (the "not-yet-confirmed" tier for deaths — \ +covers "suspected", "probable", "under investigation" reporting) - hospitalizations - recoveries - vaccinations_administered @@ -58,7 +64,10 @@ If none of these fit, invent a short snake_case label. Do NOT put \ qualifiers (sex, age, sub-region, time-period like "weekly") in \ metric_name — capture those in `summary` or `location` instead. \ -"cases", "reported cases", "total cases" all map to confirmed_cases. +"cases", "reported cases", "total cases" all map to confirmed_cases. \ +"suspected cases", "probable cases", "possible cases" all map to \ +suspected_cases. "deaths" alone maps to deaths; "suspected deaths", \ +"probable deaths", "deaths under investigation" map to suspected_deaths. 7. Be aware of cognitive biases that affect information processing: - Anchoring: do not over-weight the first number you encounter. - Availability: rare dramatic events are not necessarily more likely. 
diff --git a/bioscancast/insight/pipeline.py b/bioscancast/insight/pipeline.py index ea294fd..7d5c5dd 100644 --- a/bioscancast/insight/pipeline.py +++ b/bioscancast/insight/pipeline.py @@ -103,6 +103,30 @@ def run( result = InsightRunResult() embedding_cache: dict[str, list[float]] = {} + # Adaptive top-k: when the filter passes through only a handful + # of usable documents, lift retrieval depth so the per-doc chunk + # budget isn't the bottleneck on coverage. See InsightConfig + # docstrings for the rationale. + usable_doc_count = sum( + 1 for d in documents if d.status != "failed" and d.chunks + ) + if usable_doc_count <= config.low_survival_doc_threshold: + effective_top_k = max(config.retrieval_top_k, config.low_survival_top_k) + effective_max_chunks = max( + config.max_chunks_per_document, config.low_survival_top_k + ) + if effective_top_k != config.retrieval_top_k: + result.notes.append( + f"Low-survival adaptive top_k engaged: " + f"{usable_doc_count} usable docs (≤ threshold " + f"{config.low_survival_doc_threshold}); " + f"retrieval_top_k={effective_top_k} (default " + f"{config.retrieval_top_k})." 
+ ) + else: + effective_top_k = config.retrieval_top_k + effective_max_chunks = config.max_chunks_per_document + for doc in documents: # --- Skip check --- if doc.status == "failed" or not doc.chunks: @@ -126,7 +150,7 @@ def run( question, doc, self._llm, - top_k=config.retrieval_top_k, + top_k=effective_top_k, bm25_weight=config.bm25_weight, embedding_weight=config.embedding_weight, embedding_model=config.embedding_model, @@ -134,7 +158,7 @@ def run( ) # Cap chunks per document - scored_chunks = scored_chunks[: config.max_chunks_per_document] + scored_chunks = scored_chunks[:effective_max_chunks] # --- Per-chunk extraction (parallel within a doc) --- # Live tests on real biosecurity documents show the per-doc diff --git a/bioscancast/tests/test_insight_chunk_extractor.py b/bioscancast/tests/test_insight_chunk_extractor.py index 2ba0421..4a112b9 100644 --- a/bioscancast/tests/test_insight_chunk_extractor.py +++ b/bioscancast/tests/test_insight_chunk_extractor.py @@ -367,6 +367,25 @@ def test_response_returned_for_budget_tracking(): ] +_LAYER4_CASE_INSENSITIVE_CASES = [ + ( + # Real q12 finding: model lowercased the leading "T" of a sentence + # it quoted from mid-paragraph; otherwise verbatim. + "leading letter lowercased by model", + "There are now 750 suspected cases and 177 suspected deaths, though more are expected.", + "there are now 750 suspected cases and 177 suspected deaths", + True, + ), + ( + # Real q12 finding: same drift on a longer attribution clause. 
+ "leading 'The' lowercased mid-paragraph quote", + "The Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths.", + "the Congolese Ministry of Communication, in a post to X on Sunday, said that there were 904 suspected cases and 119 suspected deaths", + True, + ), +] + + _HALLUCINATION_CASES = [ ( "fabricated word inserted into list", @@ -374,6 +393,17 @@ def test_response_returned_for_budget_tracking(): "Ghana, Atlantis, and Liberia have reported human mpox due to clade IIa MPXV.", False, ), + ( + # Real q12 finding: model bolted a real prefix ("a total of 105 + # confirmed cases (including 10 deaths)") onto a fabricated + # continuation. The source actually continues "...and 906 + # suspected cases". Must stay rejected even with the new + # case-insensitive layer 4. + "real prefix bolted onto fabricated continuation (q12)", + "According to the Ministry of Health of DRC on 25 May, a total of 105 confirmed cases (including 10 deaths) and 906 suspected cases.", + "a total of 105 confirmed cases (including 10 deaths) have been reported in Ituri, North Kivu, and South Kivu", + False, + ), ( "wholesale fabrication", "Some real chunk content about measles cases in Utah.", @@ -410,7 +440,10 @@ def test_response_returned_for_budget_tracking(): @pytest.mark.parametrize( "label,chunk_text,quote,should_match", - _LAYER1_NFKC_CASES + _LAYER2_TERMINAL_PUNCTUATION_CASES + _LAYER3_WRAPPING_PUNCTUATION_CASES, + _LAYER1_NFKC_CASES + + _LAYER2_TERMINAL_PUNCTUATION_CASES + + _LAYER3_WRAPPING_PUNCTUATION_CASES + + _LAYER4_CASE_INSENSITIVE_CASES, ) def test_quote_matches_accepts_real_quotes_with_normalisation_drift( label, chunk_text, quote, should_match diff --git a/bioscancast/tests/test_insight_pipeline.py b/bioscancast/tests/test_insight_pipeline.py index 1d6b9ba..679924a 100644 --- a/bioscancast/tests/test_insight_pipeline.py +++ b/bioscancast/tests/test_insight_pipeline.py @@ -59,7 +59,9 @@ def 
test_pipeline_single_document(): RISK_ASSESSMENT_RESPONSE, # chunk p4 (no facts) ]) - config = InsightConfig(retrieval_top_k=5, max_chunks_per_document=5) + config = InsightConfig( + retrieval_top_k=5, max_chunks_per_document=5, low_survival_top_k=5, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -91,7 +93,9 @@ def test_pipeline_skips_failed_documents(): EMPTY_RESPONSE, # For the one chunk that gets extracted ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) # Include a failed document alongside a successful one @@ -114,7 +118,9 @@ def test_pipeline_budget_tracking(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN]) @@ -137,6 +143,7 @@ def test_pipeline_stops_on_budget_exceeded(): config = InsightConfig( retrieval_top_k=2, max_chunks_per_document=2, + low_survival_top_k=2, max_input_tokens_per_run=1, # Absurdly low -> triggers immediately ) pipeline = InsightPipeline(llm_client=client, config=config) @@ -170,7 +177,9 @@ def test_pipeline_deduplication(): DUPLICATE_SUDAN_CASE_COUNT, # doc 2 -> 1 fact (duplicate case) ]) - config = InsightConfig(retrieval_top_k=1, max_chunks_per_document=1) + config = InsightConfig( + retrieval_top_k=1, max_chunks_per_document=1, low_survival_top_k=1, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN, doc2]) @@ -580,6 +589,7 @@ def test_pipeline_parallel_chunk_extraction_produces_all_records(): config = InsightConfig( retrieval_top_k=4, 
max_chunks_per_document=4, + low_survival_top_k=4, chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, config=config) @@ -603,10 +613,12 @@ def test_pipeline_sequential_and_parallel_produce_same_record_count(): of records when the fake LLM is content-keyed (so result depends on chunk content, not worker order).""" config_seq = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=1, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=1, ) config_par = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) seq_pipeline = InsightPipeline( @@ -653,7 +665,8 @@ def embed(self, texts, *, model): fake = _IntermittentFake() config = InsightConfig( - retrieval_top_k=4, max_chunks_per_document=4, chunk_workers=4, + retrieval_top_k=4, max_chunks_per_document=4, low_survival_top_k=4, + chunk_workers=4, ) pipeline = InsightPipeline(llm_client=fake, config=config) # Must not raise — failed chunk is logged and skipped @@ -684,7 +697,9 @@ def test_pipeline_multi_document(): H5N1_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_H5N1, [DOC_WHO_SUDAN, DOC_CDC_H5N1]) @@ -709,7 +724,9 @@ def test_pipeline_output_records_valid(): SUDAN_TABLE_RESPONSE, ]) - config = InsightConfig(retrieval_top_k=2, max_chunks_per_document=2) + config = InsightConfig( + retrieval_top_k=2, max_chunks_per_document=2, low_survival_top_k=2, + ) pipeline = InsightPipeline(llm_client=client, config=config) result = pipeline.run(QUESTION_SUDAN, [DOC_WHO_SUDAN])